
#pragma once

#include <torch/csrc/inductor/aoti_torch/c/shim.h>

#ifdef __cplusplus
extern "C" {
#endif

// Exported declarations for _fw_primal, _make_dual,
// _new_zeros_with_same_feature_meta and _has_same_storage_numel.
// Each function returns an AOTITorchError and ends with a `ret0` pointer
// parameter (AtenTensorHandle* for the first three, int32_t* for the last).
// NOTE(review): `ret0` is presumably an output slot written by the callee;
// the implementation is not visible in this header — confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fw_primal(AtenTensorHandle self, int64_t level, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_dual(AtenTensorHandle primal, AtenTensorHandle tangent, int64_t level, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__new_zeros_with_same_feature_meta(AtenTensorHandle self, AtenTensorHandle other, int64_t self_num_batch_dims, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__has_same_storage_numel(AtenTensorHandle self, AtenTensorHandle other, int32_t* ret0);
// Exported declarations for the assert, print, sym_constrain_range and
// _make_dep_token entry points. Each returns an AOTITorchError.
// The `_functional_*` variants additionally take a `dep_token` handle and an
// `AtenTensorHandle* ret0` parameter.
// NOTE(review): `min`/`max` are passed as int64_t pointers, presumably so a
// bound can be omitted by passing a null pointer — confirm with the
// implementation. The same presumably applies to the int32_t* option
// parameters of _make_dep_token.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__assert_async(AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__assert_async_msg(AtenTensorHandle self, const char* assert_msg);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__assert_scalar(double self, const char* assert_msg);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__functional_assert_scalar(double self, const char* assert_msg, AtenTensorHandle dep_token, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__functional_assert_async_msg(AtenTensorHandle self, const char* assert_msg, AtenTensorHandle dep_token, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__print(const char* s);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sym_constrain_range(double size, int64_t* min, int64_t* max);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sym_constrain_range_for_size(double size, int64_t* min, int64_t* max);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__functional_sym_constrain_range(double size, int64_t* min, int64_t* max, AtenTensorHandle dep_token, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__functional_sym_constrain_range_for_size(double size, int64_t* min, int64_t* max, AtenTensorHandle dep_token, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_dep_token(int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
// Exported declarations for native_dropout, abs_, view_as_real,
// view_as_complex, _conj, _conj_physical and _neg_view.
// Each returns an AOTITorchError. native_dropout declares two
// AtenTensorHandle* parameters (`ret0`, `ret1`); the others declare one.
// NOTE(review): `train` is an int32_t*, presumably an optional flag where a
// null pointer means "not provided" — confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_abs_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_as_real(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_as_complex(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__conj(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__conj_physical(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__neg_view(AtenTensorHandle self, AtenTensorHandle* ret0);
// Exported declarations for the _add_relu, addmv and addr entry points.
// Each returns an AOTITorchError.
// The `*_out` declarations take the `out` handle as their first parameter and
// have no `ret` parameter; the remaining ones end with `AtenTensorHandle* ret0`.
// `alpha`, `beta` and the scalar form of `other` are declared as `double`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__add_relu_Tensor(AtenTensorHandle self, AtenTensorHandle other, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__add_relu__Tensor(AtenTensorHandle self, AtenTensorHandle other, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__add_relu_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__add_relu_Scalar(AtenTensorHandle self, double other, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__add_relu__Scalar(AtenTensorHandle self, double other, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addmv_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat, AtenTensorHandle vec, double beta, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addr(AtenTensorHandle self, AtenTensorHandle vec1, AtenTensorHandle vec2, double beta, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addr_(AtenTensorHandle self, AtenTensorHandle vec1, AtenTensorHandle vec2, double beta, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addr_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle vec1, AtenTensorHandle vec2, double beta, double alpha);
// Exported declarations for affine_grid_generator, _is_all_true, _is_any_true,
// _test_functorch_fallback, the all/any reductions and allclose.
// Each returns an AOTITorchError. Flags such as `keepdim`, `align_corners`
// and `equal_nan` are declared as int32_t; allclose's `ret0` is an int32_t*.
// NOTE(review): in the `*_dims` variants `dim` is a `const int64_t**` paired
// with `dim_len_` — presumably an optional array (pointer to the array
// pointer, with a length); confirm with the implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_affine_grid_generator(AtenTensorHandle theta, const int64_t* size, int64_t size_len_, int32_t align_corners, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__is_all_true(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__is_any_true(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_functorch_fallback(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_all_dims(AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_all_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_all_dims_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_allclose(AtenTensorHandle self, AtenTensorHandle other, double rtol, double atol, int32_t equal_nan, int32_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_any_dims(AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_any_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_any_dims_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim);
// Exported declarations for the arange family, argmax_out / argmin_out and
// as_strided / as_strided_. Each returns an AOTITorchError.
// The non-`out` arange overloads take the tensor options as
// (int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_,
// int32_t* pin_memory) followed by `AtenTensorHandle* ret0`.
// NOTE(review): the pointer-typed scalars (`dtype`, `layout`, `device`,
// `pin_memory`, argmax/argmin `dim`, `storage_offset`) are presumably optional
// values where null means "not provided" — confirm with the implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_arange(double end, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_arange_start(double start, double end, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_arange_start_step(double start, double end, double step, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_arange_out(AtenTensorHandle out, double end);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_arange_start_out(AtenTensorHandle out, double start, double end, double step);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_argmax_out(AtenTensorHandle out, AtenTensorHandle self, int64_t* dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_argmin_out(AtenTensorHandle out, AtenTensorHandle self, int64_t* dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_as_strided(AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int64_t* storage_offset, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_as_strided_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int64_t* storage_offset, AtenTensorHandle* ret0);
// Exported declarations for baddbmm_out, bartlett_window, the
// binary_cross_entropy family and bincount. Each returns an AOTITorchError.
// `reduction` and `minlength` are declared as int64_t.
// NOTE(review): `weight`, `pos_weight` and `weights` are `AtenTensorHandle*`
// (a pointer to a handle rather than a handle), presumably marking them as
// optional tensors with null meaning "absent" — confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_baddbmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle batch1, AtenTensorHandle batch2, double beta, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bartlett_window(int64_t window_length, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bartlett_window_periodic(int64_t window_length, int32_t periodic, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_binary_cross_entropy(AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_binary_cross_entropy_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_binary_cross_entropy_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_binary_cross_entropy_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_binary_cross_entropy_with_logits(AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, AtenTensorHandle* pos_weight, int64_t reduction, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bincount(AtenTensorHandle self, AtenTensorHandle* weights, int64_t minlength, AtenTensorHandle* ret0);
// Exported declarations for copysign__Scalar, _lazy_clone, blackman_window,
// bmm_out, cat_out, block_diag, complex, polar and constant_pad_nd.
// Each returns an AOTITorchError.
// cat_out and block_diag take their tensor list as
// (const AtenTensorHandle* tensors, int64_t tensors_len_); constant_pad_nd
// takes `pad` as (const int64_t* pad, int64_t pad_len_).
// NOTE(review): the `*_len_` parameter is presumably the element count of the
// preceding array pointer — confirm with the implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_copysign__Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__lazy_clone(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_blackman_window(int64_t window_length, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_blackman_window_periodic(int64_t window_length, int32_t periodic, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cat_out(AtenTensorHandle out, const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_block_diag(const AtenTensorHandle* tensors, int64_t tensors_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_complex(AtenTensorHandle real, AtenTensorHandle imag, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_complex_out(AtenTensorHandle out, AtenTensorHandle real, AtenTensorHandle imag);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_polar(AtenTensorHandle abs, AtenTensorHandle angle, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_polar_out(AtenTensorHandle out, AtenTensorHandle abs, AtenTensorHandle angle);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_constant_pad_nd(AtenTensorHandle self, const int64_t* pad, int64_t pad_len_, double value, AtenTensorHandle* ret0);
// Exported declarations for the convolution family (forward, backward,
// overrideable variants, _convolution and conv_tbc).
// Each returns an AOTITorchError.
// `stride`, `padding`, `dilation` and `output_padding` are each passed as a
// `const int64_t*` followed by a matching `*_len_` int64_t. The backward
// variants take `output_mask` as (const int32_t*, int64_t output_mask_len_)
// and declare three AtenTensorHandle* parameters (`ret0`..`ret2`).
// NOTE(review): `bias` (AtenTensorHandle*) and `bias_sizes` (const int64_t**)
// are presumably optional arguments, with null meaning "absent" — confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle weight, const int64_t** bias_sizes, int64_t bias_sizes_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution_overrideable(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution_backward_overrideable(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle weight, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__convolution(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, int32_t benchmark, int32_t deterministic, int32_t cudnn_enabled, int32_t allow_tf32, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_conv_tbc(AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle bias, int64_t pad, AtenTensorHandle* ret0);
// Exported declarations for copy / copy_, count_nonzero, cummax / cummin
// (with their `_out` and `_helper` forms) and cumprod_out / cumsum_out.
// Each returns an AOTITorchError.
// cummax and cummin declare two AtenTensorHandle* parameters (`ret0`, `ret1`);
// their `_out` and `_helper` forms instead take `values` and `indices`
// handles by value and have no `ret` parameter.
// NOTE(review): the pointer-typed `dim` (count_nonzero) and `dtype`
// (cumprod_out / cumsum_out) are presumably optional values — confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_copy(AtenTensorHandle self, AtenTensorHandle src, int32_t non_blocking, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_copy_(AtenTensorHandle self, AtenTensorHandle src, int32_t non_blocking, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_count_nonzero_dim_IntList(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_count_nonzero(AtenTensorHandle self, int64_t* dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cummax(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cummax_out(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cummax_helper(AtenTensorHandle self, AtenTensorHandle values, AtenTensorHandle indices, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cummin(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cummin_out(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cummin_helper(AtenTensorHandle self, AtenTensorHandle values, AtenTensorHandle indices, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cumprod_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cumsum_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t* dtype);
// Exported declarations for _ctc_loss and _ctc_loss_backward.
// Each returns an AOTITorchError.
// Two overloads exist per direction: one takes `input_lengths` /
// `target_lengths` as (const int64_t*, int64_t *_len_) pairs, the `_Tensor`
// one takes them as AtenTensorHandle values. The forward declarations have
// two AtenTensorHandle* parameters (`ret0`, `ret1`); the backward ones have one.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__ctc_loss(AtenTensorHandle log_probs, AtenTensorHandle targets, const int64_t* input_lengths, int64_t input_lengths_len_, const int64_t* target_lengths, int64_t target_lengths_len_, int64_t blank, int32_t zero_infinity, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__ctc_loss_Tensor(AtenTensorHandle log_probs, AtenTensorHandle targets, AtenTensorHandle input_lengths, AtenTensorHandle target_lengths, int64_t blank, int32_t zero_infinity, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__ctc_loss_backward(AtenTensorHandle grad, AtenTensorHandle log_probs, AtenTensorHandle targets, const int64_t* input_lengths, int64_t input_lengths_len_, const int64_t* target_lengths, int64_t target_lengths_len_, AtenTensorHandle neg_log_likelihood, AtenTensorHandle log_alpha, int64_t blank, int32_t zero_infinity, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__ctc_loss_backward_Tensor(AtenTensorHandle grad, AtenTensorHandle log_probs, AtenTensorHandle targets, AtenTensorHandle input_lengths, AtenTensorHandle target_lengths, AtenTensorHandle neg_log_likelihood, AtenTensorHandle log_alpha, int64_t blank, int32_t zero_infinity, AtenTensorHandle* ret0);
// Exported declarations for diag_embed, diagonal, diagonal_backward, dot and
// vdot (plus the `_out` forms of dot and vdot).
// Each returns an AOTITorchError.
// `offset`, `dim1` and `dim2` are declared as int64_t; diagonal_backward takes
// `input_sizes` as (const int64_t*, int64_t input_sizes_len_).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diag_embed(AtenTensorHandle self, int64_t offset, int64_t dim1, int64_t dim2, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diagonal(AtenTensorHandle self, int64_t offset, int64_t dim1, int64_t dim2, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diagonal_backward(AtenTensorHandle grad_output, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t offset, int64_t dim1, int64_t dim2, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dot(AtenTensorHandle self, AtenTensorHandle tensor, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dot_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle tensor);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_vdot(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_vdot_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other);
// Exported declarations for embedding, embedding_dense_backward,
// embedding_renorm_ and the _embedding_bag family.
// Each returns an AOTITorchError.
// _embedding_bag_forward_only and _embedding_bag declare four
// AtenTensorHandle* parameters (`ret0`..`ret3`); the others declare one.
// NOTE(review): `per_sample_weights` is an AtenTensorHandle*, presumably an
// optional tensor with null meaning "absent" — confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_embedding(AtenTensorHandle weight, AtenTensorHandle indices, int64_t padding_idx, int32_t scale_grad_by_freq, int32_t sparse, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_embedding_dense_backward(AtenTensorHandle grad_output, AtenTensorHandle indices, int64_t num_weights, int64_t padding_idx, int32_t scale_grad_by_freq, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_embedding_renorm_(AtenTensorHandle self, AtenTensorHandle indices, double max_norm, double norm_type, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag_forward_only(AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, int32_t scale_grad_by_freq, int64_t mode, int32_t sparse, AtenTensorHandle* per_sample_weights, int32_t include_last_offset, int64_t padding_idx, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag(AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, int32_t scale_grad_by_freq, int64_t mode, int32_t sparse, AtenTensorHandle* per_sample_weights, int32_t include_last_offset, int64_t padding_idx, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag_dense_backward(AtenTensorHandle grad, AtenTensorHandle indices, AtenTensorHandle offset2bag, AtenTensorHandle bag_size, AtenTensorHandle maximum_indices, int64_t num_weights, int32_t scale_grad_by_freq, int64_t mode, AtenTensorHandle* per_sample_weights, int64_t padding_idx, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag_per_sample_weights_backward(AtenTensorHandle grad, AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, AtenTensorHandle offset2bag, int64_t mode, int64_t padding_idx, AtenTensorHandle* ret0);
// Exported declarations for the empty / new_* / quantized-empty factories,
// resize_, empty_like and empty_strided. Each returns an AOTITorchError and
// ends with an `AtenTensorHandle* ret0` parameter.
// Shapes are passed as (const int64_t* size, int64_t size_len_); strides and
// `physical_layout` use the same pointer-plus-length form.
// NOTE(review): the int32_t* option parameters (`dtype`, `layout`, `device`,
// `pin_memory`, `memory_format`) are presumably optional values where null
// means "not provided"; `device_index_` is a plain int32_t alongside
// `device` — confirm the exact encoding with the implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_memory_format(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_permuted(const int64_t* size, int64_t size_len_, const int64_t* physical_layout, int64_t physical_layout_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_empty(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_empty_strided(AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_full(AtenTensorHandle self, const int64_t* size, int64_t size_len_, double fill_value, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_zeros(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_ones(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__empty_affine_quantized(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, double scale, int64_t zero_point, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__empty_per_channel_affine_quantized(const int64_t* size, int64_t size_len_, AtenTensorHandle scales, AtenTensorHandle zero_points, int64_t axis, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_like(AtenTensorHandle self, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_strided(const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
// Exported declarations for expand, eye, fill, floor_divide, full and
// from_file. Each returns an AOTITorchError.
// The `*_out` declarations take the `out` handle first and have no `ret`
// parameter; every other declaration ends with `AtenTensorHandle* ret0`.
// Scalar operands (`value`, `other`, `fill_value`) are declared as `double`.
// NOTE(review): in from_file, `shared` (int32_t*) and `size` (int64_t*) are
// single pointers with no accompanying length, so they are presumably
// optional scalars rather than arrays — confirm with the implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_expand(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t implicit, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_eye(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_eye_m(int64_t n, int64_t m, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_eye_out(AtenTensorHandle out, int64_t n);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_eye_m_out(AtenTensorHandle out, int64_t n, int64_t m);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fill_Scalar(AtenTensorHandle self, double value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fill_Tensor(AtenTensorHandle self, AtenTensorHandle value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fill__Scalar(AtenTensorHandle self, double value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fill__Tensor(AtenTensorHandle self, AtenTensorHandle value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_floor_divide(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_floor_divide__Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_floor_divide_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_floor_divide_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_floor_divide__Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_full(const int64_t* size, int64_t size_len_, double fill_value, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_full_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_, double fill_value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_full_like(AtenTensorHandle self, double fill_value, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_from_file(const char* filename, int32_t* shared, int64_t* size, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
// Exported declarations for grid_sampler_2d / grid_sampler_3d, their backward
// forms and _grid_sampler_2d_cpu_fallback. Each returns an AOTITorchError.
// `interpolation_mode` and `padding_mode` are declared as int64_t and
// `align_corners` as int32_t. The backward declarations take `output_mask` as
// (const int32_t*, int64_t output_mask_len_) and declare two
// AtenTensorHandle* parameters (`ret0`, `ret1`).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_2d(AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__grid_sampler_2d_cpu_fallback(AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_3d(AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_3d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
// Exported declarations for the hann_window, hamming_window and kaiser_window
// overloads. Each returns an AOTITorchError and ends with
// `AtenTensorHandle* ret0`.
// Overloads differ only in the leading arguments: `window_length` alone, plus
// `periodic` (int32_t), plus `alpha` and/or `beta` (double). All share the
// trailing (dtype, layout, device, device_index_, pin_memory) option group.
// NOTE(review): the int32_t* options are presumably nullable to mean
// "not provided" — confirm with the implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hann_window(int64_t window_length, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hann_window_periodic(int64_t window_length, int32_t periodic, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window(int64_t window_length, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window_periodic(int64_t window_length, int32_t periodic, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window_periodic_alpha(int64_t window_length, int32_t periodic, double alpha, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window_periodic_alpha_beta(int64_t window_length, int32_t periodic, double alpha, double beta, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kaiser_window(int64_t window_length, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kaiser_window_periodic(int64_t window_length, int32_t periodic, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kaiser_window_beta(int64_t window_length, int32_t periodic, double beta, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
// Exported declarations for native_group_norm and its backward.
// Each returns an AOTITorchError and declares three AtenTensorHandle*
// parameters (`ret0`..`ret2`). `N`, `C`, `HxW` and `group` are int64_t.
// NOTE(review): `weight` and `bias` are AtenTensorHandle*, presumably optional
// tensors with null meaning "absent" — confirm with the implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_group_norm(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_group_norm_backward(AtenTensorHandle grad_out, AtenTensorHandle input, AtenTensorHandle mean, AtenTensorHandle rstd, AtenTensorHandle* weight, int64_t N, int64_t C, int64_t HxW, int64_t group, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
// Exported declarations for _fft_r2c, _fft_c2r and _fft_c2c with their `_out`
// forms. Each returns an AOTITorchError.
// All take `dim` as (const int64_t*, int64_t dim_len_) and `normalization` as
// int64_t. The final argument differs per transform: `onesided` (int32_t) for
// r2c, `last_dim_size` (int64_t) for c2r, `forward` (int32_t) for c2c.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fft_r2c(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t onesided, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fft_r2c_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t onesided);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fft_c2r(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int64_t last_dim_size, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fft_c2r_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int64_t last_dim_size);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fft_c2c(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t forward, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fft_c2c_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t forward);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__validate_compressed_sparse_indices(int32_t is_crow, AtenTensorHandle compressed_idx, AtenTensorHandle plain_idx, int64_t cdim, int64_t dim, int64_t nnz);
// Indexing shims (`index`, `_unsafe_index`, `index_copy`, `index_put` family).
// `indices` is declared `const AtenTensorHandle**` with its count in
// `indices_len_`. NOTE(review): the extra level of indirection presumably lets
// individual entries be null (optional tensors) -- confirm in the shim
// implementation before relying on it.
// `accumulate` and `unsafe` are passed as `int32_t` (presumably boolean flags).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unsafe_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_copy_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_put_(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unsafe_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__index_put_impl_(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, int32_t unsafe, AtenTensorHandle* ret0);
// `isin`, `is_same_size` and `kthvalue` shims.
// The three `isin_*_out` overloads differ only in whether the element and the
// test-element arguments are tensor handles or `double` scalars.
// `is_same_size` takes an `int32_t* ret0` slot rather than a tensor handle.
// `kthvalue` has two tensor result slots (`ret0`, `ret1`); `kthvalue_values`
// instead takes caller-supplied `values` and `indices` handles up front.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_isin_Tensor_Tensor_out(AtenTensorHandle out, AtenTensorHandle elements, AtenTensorHandle test_elements, int32_t assume_unique, int32_t invert);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_isin_Tensor_Scalar_out(AtenTensorHandle out, AtenTensorHandle elements, double test_element, int32_t assume_unique, int32_t invert);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_isin_Scalar_Tensor_out(AtenTensorHandle out, double element, AtenTensorHandle test_elements, int32_t assume_unique, int32_t invert);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_is_same_size(AtenTensorHandle self, AtenTensorHandle other, int32_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kthvalue_values(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim);
// Layer-norm (forward and backward) and `linear_out` shims.
// `weight` and `bias` are declared `AtenTensorHandle*` (pointer to handle)
// rather than plain handles -- presumably so a null pointer can express an
// absent optional tensor; TODO confirm.
// `normalized_shape` and `output_mask` are arrays passed as pointer plus
// `*_len_` count; `output_mask` elements are `int32_t`.
// The layer-norm entry points each have three result slots (`ret0`..`ret2`).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_layer_norm(AtenTensorHandle input, const int64_t* normalized_shape, int64_t normalized_shape_len_, AtenTensorHandle* weight, AtenTensorHandle* bias, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_layer_norm_backward(AtenTensorHandle grad_out, AtenTensorHandle input, const int64_t* normalized_shape, int64_t normalized_shape_len_, AtenTensorHandle mean, AtenTensorHandle rstd, AtenTensorHandle* weight, AtenTensorHandle* bias, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linear_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias);
// `linspace` shims. The overload suffix (`Tensor_Tensor`, `Tensor_Scalar`,
// `Scalar_Tensor`, or none) matches whether `start` / `end` are tensor handles
// or `double` values in the signature.
// The non-`_out` variants additionally take `dtype`, `layout`, `device` and
// `pin_memory` as `int32_t*` plus `device_index_` by value (the pointers are
// presumably nullable optionals -- confirm). The `_out` variants take only the
// destination `out` and the range arguments.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace(double start, double end, int64_t steps, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace_Tensor_Tensor(AtenTensorHandle start, AtenTensorHandle end, int64_t steps, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace_Tensor_Scalar(AtenTensorHandle start, double end, int64_t steps, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace_Scalar_Tensor(double start, AtenTensorHandle end, int64_t steps, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace_out(AtenTensorHandle out, double start, double end, int64_t steps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace_Tensor_Tensor_out(AtenTensorHandle out, AtenTensorHandle start, AtenTensorHandle end, int64_t steps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace_Tensor_Scalar_out(AtenTensorHandle out, AtenTensorHandle start, double end, int64_t steps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linspace_Scalar_Tensor_out(AtenTensorHandle out, double start, AtenTensorHandle end, int64_t steps);
// `xlogy_` (scalar `other`) and the `logspace` shims.
// The `logspace` overloads mirror the Tensor/Scalar naming used for `start`
// and `end`, and every variant takes an explicit `double base`.
// Non-`_out` variants take `dtype`, `layout`, `device`, `pin_memory` as
// `int32_t*` plus `device_index_` by value; `_out` variants take `out` first
// and have no `ret0` parameter.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_xlogy__Scalar_Other(AtenTensorHandle self, double other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace(double start, double end, int64_t steps, double base, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace_Tensor_Tensor(AtenTensorHandle start, AtenTensorHandle end, int64_t steps, double base, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace_Tensor_Scalar(AtenTensorHandle start, double end, int64_t steps, double base, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace_Scalar_Tensor(double start, AtenTensorHandle end, int64_t steps, double base, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace_out(AtenTensorHandle out, double start, double end, int64_t steps, double base);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace_Tensor_Tensor_out(AtenTensorHandle out, AtenTensorHandle start, AtenTensorHandle end, int64_t steps, double base);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace_Tensor_Scalar_out(AtenTensorHandle out, AtenTensorHandle start, double end, int64_t steps, double base);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logspace_Scalar_Tensor_out(AtenTensorHandle out, double start, AtenTensorHandle end, int64_t steps, double base);
// `log_softmax`, `logcumsumexp` and `logsumexp` shims.
// Note the differing dtype conventions in this group: `log_softmax_int_out`
// takes `int32_t* dtype` (pointer), while `_log_softmax_backward_data_out`
// takes `int32_t input_dtype` by value.
// `logsumexp` takes its reduction dims as `const int64_t* dim` + `dim_len_`,
// whereas the `logcumsumexp` entry points take a single `int64_t dim`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_log_softmax_int_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__log_softmax_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t half_to_float);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__log_softmax_backward_data_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle output, int64_t dim, int32_t input_dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__logcumsumexp_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logcumsumexp_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logsumexp(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int32_t keepdim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logsumexp_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int32_t keepdim);
// Min/max, mean and median style reductions, plus `_compute_linear_combination`.
// Dimension arguments come in three shapes in this group:
//   - `int64_t dim`                      : a single dimension by value;
//   - `int64_t* dim` (aminmax_out)       : pointer to one value, presumably a
//                                          nullable optional -- TODO confirm;
//   - `const int64_t* dim` + `dim_len_`  : an array of dimensions;
//   - `const int64_t** dim` + `dim_len_` : (mean_out) pointer to an array
//                                          pointer, presumably an optional
//                                          list -- TODO confirm.
// Entry points that yield two tensors either expose `ret0`/`ret1` slots or,
// for the `*_values` / `*_dim_max` / `*_dim_min` / `aminmax_out` forms, take
// both destination handles as leading arguments.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__aminmax(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__aminmax_dim(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_aminmax_out(AtenTensorHandle min, AtenTensorHandle max, AtenTensorHandle self, int64_t* dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__compute_linear_combination(AtenTensorHandle input, AtenTensorHandle coefficients, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__compute_linear_combination_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle coefficients);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_dim_max(AtenTensorHandle max, AtenTensorHandle max_values, AtenTensorHandle self, int64_t dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_amax_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mean(AtenTensorHandle self, int32_t* dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mean_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_median(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_median_dim(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_median_dim_values(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nanmedian_dim(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nanmedian_dim_values(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_min_dim_min(AtenTensorHandle min, AtenTensorHandle min_indices, AtenTensorHandle self, int64_t dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_amin_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int32_t keepdim);
// MKLDNN convolution and RNN-layer shims.
// `padding`, `stride`, `dilation` and `batch_sizes` are int64 arrays passed as
// pointer plus `*_len_` count. `bias`, `grad_output`, `grad_hy` and `grad_cy`
// are `AtenTensorHandle*` (presumably nullable optionals -- confirm).
// Parameter naming differs between the two RNN entry points: the forward call
// uses `weight0`..`weight3` while the backward call uses `weight1`..`weight4`.
// `mkldnn_rnn_layer` has four result slots; the backward call has seven.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_convolution(AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_rnn_layer(AtenTensorHandle input, AtenTensorHandle weight0, AtenTensorHandle weight1, AtenTensorHandle weight2, AtenTensorHandle weight3, AtenTensorHandle hx_, AtenTensorHandle cx_, int32_t reverse, const int64_t* batch_sizes, int64_t batch_sizes_len_, int64_t mode, int64_t hidden_size, int64_t num_layers, int32_t has_biases, int32_t bidirectional, int32_t batch_first, int32_t train, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_rnn_layer_backward(AtenTensorHandle input, AtenTensorHandle weight1, AtenTensorHandle weight2, AtenTensorHandle weight3, AtenTensorHandle weight4, AtenTensorHandle hx_, AtenTensorHandle cx_tmp, AtenTensorHandle output, AtenTensorHandle hy_, AtenTensorHandle cy_, AtenTensorHandle* grad_output, AtenTensorHandle* grad_hy, AtenTensorHandle* grad_cy, int32_t reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, int32_t has_biases, int32_t train, int32_t bidirectional, const int64_t* batch_sizes, int64_t batch_sizes_len_, int32_t batch_first, AtenTensorHandle workspace, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3, AtenTensorHandle* ret4, AtenTensorHandle* ret5, AtenTensorHandle* ret6);
// Matrix-multiply shims (`mm`, `_int_mm`, int4/int8 packed-weight variants,
// `mv`) together with `mode` and `narrow_copy`.
// All tensor arguments here are plain `AtenTensorHandle` values; none of the
// entry points in this group takes a pointer-to-handle input.
// `mode` exposes two result slots; `mode_values` takes the `values` and
// `indices` destinations as its first two arguments instead.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__int_mm(AtenTensorHandle self, AtenTensorHandle mat2, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__int_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__convert_weight_to_int4pack(AtenTensorHandle self, int64_t innerKTiles, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__weight_int4pack_mm(AtenTensorHandle self, AtenTensorHandle mat2, int64_t qGroupSize, AtenTensorHandle qScaleAndZeros, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__weight_int8pack_mm(AtenTensorHandle self, AtenTensorHandle mat2, AtenTensorHandle scales, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mode_values(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t dim, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mv(AtenTensorHandle self, AtenTensorHandle vec, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mv_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle vec);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_narrow_copy(AtenTensorHandle self, int64_t dim, int64_t start, int64_t length, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_narrow_copy_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int64_t start, int64_t length);
// Batch-norm shims and the NNPACK spatial convolution shim.
// Watch the handle kinds for the running statistics: `native_batch_norm`,
// `native_batch_norm_out`, `native_batch_norm_backward` and
// `batch_norm_update_stats` declare `running_mean` / `running_var` as
// `AtenTensorHandle*`, while the `_native_batch_norm_legit*` entry points that
// accept them declare plain `AtenTensorHandle`. `weight` and `bias` are
// `AtenTensorHandle*` throughout (presumably nullable optionals -- confirm).
// `_native_batch_norm_legit_no_training` has no `training` parameter, and the
// `_no_stats` variants have no running-statistics parameters.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_batch_norm(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, int32_t training, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_batch_norm_out(AtenTensorHandle out, AtenTensorHandle save_mean, AtenTensorHandle save_invstd, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, int32_t training, double momentum, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_batch_norm_legit(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, int32_t training, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_batch_norm_legit_no_training(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_batch_norm_legit_out(AtenTensorHandle out, AtenTensorHandle save_mean, AtenTensorHandle save_invstd, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, int32_t training, double momentum, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_batch_norm_legit_no_stats(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, int32_t training, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_batch_norm_legit_no_stats_out(AtenTensorHandle out, AtenTensorHandle save_mean, AtenTensorHandle save_invstd, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, int32_t training, double momentum, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_batch_norm_backward(AtenTensorHandle grad_out, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, AtenTensorHandle* save_mean, AtenTensorHandle* save_invstd, int32_t train, double eps, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_update_stats(AtenTensorHandle input, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, double momentum, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nnpack_spatial_convolution(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, AtenTensorHandle* ret0);
// `ones` shims. `size` is an int64 array with `size_len_` entries.
// `ones` and `ones_like` take `dtype`, `layout`, `device`, `pin_memory` (and,
// for `ones_like`, `memory_format`) as `int32_t*` plus `device_index_` by
// value -- the pointers are presumably nullable optionals; confirm.
// `ones_out` takes only the destination `out` and the size array.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ones(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ones_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ones_like(AtenTensorHandle self, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
// Distance shims (`_euclidean_dist`, `_cdist_*`, `_pdist_*`).
// The `p` argument is a `double` by value. `_cdist_forward` takes
// `compute_mode` as `int64_t*` (presumably a nullable optional -- confirm).
// The backward entry points take the forward result (`cdist` / `pdist`) as an
// additional tensor handle argument.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__euclidean_dist(AtenTensorHandle x1, AtenTensorHandle x2, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cdist_forward(AtenTensorHandle x1, AtenTensorHandle x2, double p, int64_t* compute_mode, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cdist_backward(AtenTensorHandle grad, AtenTensorHandle x1, AtenTensorHandle x2, double p, AtenTensorHandle cdist, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__pdist_forward(AtenTensorHandle self, double p, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__pdist_backward(AtenTensorHandle grad, AtenTensorHandle self, double p, AtenTensorHandle pdist, AtenTensorHandle* ret0);
// `permute`, pixel/channel shuffle, `is_pinned` and `rad2deg` shims.
// `permute` takes its dimension order as `const int64_t* dims` + `dims_len_`.
// `is_pinned` takes an `int32_t* ret0` slot rather than a tensor handle, and
// its `device` argument is `int32_t*` alongside `device_index_` by value.
// `rad2deg_` (trailing underscore) still declares an `AtenTensorHandle* ret0`
// parameter, unlike `rad2deg_out`, which takes `out` first and has no `ret0`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_permute(AtenTensorHandle self, const int64_t* dims, int64_t dims_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pixel_shuffle(AtenTensorHandle self, int64_t upscale_factor, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pixel_unshuffle(AtenTensorHandle self, int64_t downscale_factor, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_channel_shuffle(AtenTensorHandle self, int64_t groups, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_channel_shuffle(AtenTensorHandle self, int64_t groups, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_is_pinned(AtenTensorHandle self, int32_t* device, int32_t device_index_, int32_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rad2deg(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rad2deg_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rad2deg_out(AtenTensorHandle out, AtenTensorHandle self);
// `scalar_tensor` and random factory shims (`rand`, `randint`, `randn`,
// `randperm` and their `_like` / `_out` / `_low` variants).
// Factory variants share the same trailing option block: `dtype`, `layout`,
// `device`, `pin_memory` as `int32_t*` plus `device_index_` by value; the
// `_like` variants add `int32_t* memory_format`. The pointers are presumably
// nullable optionals -- TODO confirm.
// `_out` variants take the destination `out` first and no option block.
// None of these signatures carries a generator/seed argument.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scalar_tensor(double s, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand_like(AtenTensorHandle self, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_low(int64_t low, int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_out(AtenTensorHandle out, int64_t high, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_like(AtenTensorHandle self, int64_t high, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_like_low_dtype(AtenTensorHandle self, int64_t low, int64_t high, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randn_like(AtenTensorHandle self, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randperm_out(AtenTensorHandle out, int64_t n);
// `range` shims. `range_step` / `range_out` take an explicit `double step`;
// `range` / `range_out_` do not.
// Easy to confuse: `range_out_` (trailing underscore) is the destination-first
// form WITHOUT `step`, while `range_out` is the destination-first form WITH it.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_range_step(double start, double end, double step, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_range(double start, double end, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_range_out_(AtenTensorHandle out, double start, double end);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_range_out(AtenTensorHandle out, double start, double end, double step);
// `repeat`, `repeat_interleave` and reshape shims.
// `repeats` (in `repeat`), `size` and `stride` are int64 arrays passed as
// pointer plus `*_len_` count. In `repeat_interleave_Tensor`, `repeats` is a
// tensor handle and `output_size` is `int64_t*` (presumably a nullable
// optional -- confirm).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_repeat(AtenTensorHandle self, const int64_t* repeats, int64_t repeats_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__reshape_copy(AtenTensorHandle self, const int64_t* size, int64_t size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__reshape_alias(AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, AtenTensorHandle* ret0);
// Activation shims: `_prelu_kernel`, `gelu`, `hardshrink` and their backward
// counterparts.
// `approximate` is passed as a NUL-terminated `const char*`; the set of
// accepted strings is not visible from this header.
// The `*_backward_grad_input` forms take the destination `grad_input` as their
// first argument; `_prelu_kernel_backward` instead has two result slots.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__prelu_kernel(AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__prelu_kernel_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_gelu_out(AtenTensorHandle out, AtenTensorHandle self, const char* approximate);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_gelu_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const char* approximate);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardshrink_out(AtenTensorHandle out, AtenTensorHandle self, double lambd);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardshrink_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_out, AtenTensorHandle self, double lambd);
// `select`, `celu`, `mish`, `detach`, `slice` and `*_scatter` shims.
// `start` / `end` are `int64_t*` in `slice_Tensor`, `slice_inverse` and
// `slice_scatter` (presumably nullable optionals -- confirm) but plain
// `int64_t` values in `slice_backward`. `storage_offset` in
// `as_strided_scatter` is likewise `int64_t*`.
// `input_sizes`, `size` and `stride` are int64 arrays with `*_len_` counts.
// The trailing-underscore entry points (`celu_`, `detach_`) still declare an
// `AtenTensorHandle* ret0` parameter.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_select_int(AtenTensorHandle self, int64_t dim, int64_t index, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_select_backward(AtenTensorHandle grad_output, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t dim, int64_t index, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_celu(AtenTensorHandle self, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_celu_(AtenTensorHandle self, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mish_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mish_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_detach(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_detach_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_backward(AtenTensorHandle grad_output, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t dim, int64_t start, int64_t end, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_inverse(AtenTensorHandle self, AtenTensorHandle src, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_scatter(AtenTensorHandle self, AtenTensorHandle src, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_select_scatter(AtenTensorHandle self, AtenTensorHandle src, int64_t dim, int64_t index, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diagonal_scatter(AtenTensorHandle self, AtenTensorHandle src, int64_t offset, int64_t dim1, int64_t dim2, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_as_strided_scatter(AtenTensorHandle self, AtenTensorHandle src, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int64_t* storage_offset, AtenTensorHandle* ret0);
// `softmax` and `squeeze` shims.
// `softmax_int_out` takes `int32_t* dtype` (pointer), whereas
// `_softmax_backward_data_out` takes `int32_t input_dtype` by value and names
// its destination `grad_input` rather than `out`.
// The `squeeze` overloads differ by dimension argument: none, a single
// `int64_t dim`, or an array `const int64_t* dim` + `dim_len_`. The
// double-underscore names (`squeeze__dim`, `squeeze__dims`) are the
// trailing-underscore `squeeze_` entry point combined with an overload suffix.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_softmax_int_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__softmax_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t half_to_float);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__softmax_backward_data_out(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle output, int64_t dim, int32_t input_dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_dim(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_dims(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze__dim(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze__dims(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, AtenTensorHandle* ret0);
// `sspaddmm_out`, `_chunk_cat` and `stack` shims.
// Tensor lists are passed as `const AtenTensorHandle* tensors` with the count
// in `tensors_len_` -- a single level of indirection, unlike the
// `const AtenTensorHandle**` used for index lists elsewhere in this header.
// `beta` and `alpha` in `sspaddmm_out` are `double` values.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sspaddmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat1, AtenTensorHandle mat2, double beta, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__chunk_cat(const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, int64_t num_chunks, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__chunk_cat_out(AtenTensorHandle out, const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, int64_t num_chunks);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_stack(const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_stack_out(AtenTensorHandle out, const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__stack(const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__stack_out(AtenTensorHandle out, const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim);
// `sum`, `nansum`, `std` and `prod` reduction shims.
// Where a dimension list is accepted it is declared `const int64_t** dim` with
// `dim_len_` -- a pointer to the array pointer, presumably so the whole list
// can be absent; TODO confirm. `prod_int_out` instead takes one `int64_t dim`.
// `dtype` is `int32_t*` and `correction` is `double*` (both presumably
// nullable optionals). `std_mean_correction` has two result slots.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sum(AtenTensorHandle self, int32_t* dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sum_IntList_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nansum(AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nansum_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_std_correction(AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_std_mean_correction(AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_std_correction_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_prod(AtenTensorHandle self, int32_t* dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_prod_int_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t keepdim, int32_t* dtype);
// `t`, `threshold`, `transpose`, `flip`, `roll` and `rot90` shims.
// `dims` and `shifts` are int64 arrays passed as pointer plus `*_len_` count.
// `threshold_backward_grad_input` takes the destination `grad_input` first.
// The trailing-underscore entry points (`t_`, `transpose_`) still declare an
// `AtenTensorHandle* ret0` parameter.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_t(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_t_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_threshold_out(AtenTensorHandle out, AtenTensorHandle self, double threshold, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_threshold_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, double threshold);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_transpose_int(AtenTensorHandle self, int64_t dim0, int64_t dim1, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_transpose_(AtenTensorHandle self, int64_t dim0, int64_t dim1, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_flip(AtenTensorHandle self, const int64_t* dims, int64_t dims_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_roll(AtenTensorHandle self, const int64_t* shifts, int64_t shifts_len_, const int64_t* dims, int64_t dims_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rot90(AtenTensorHandle self, int64_t k, const int64_t* dims, int64_t dims_len_, AtenTensorHandle* ret0);
// `_transform_bias_rescale_qkv` and nested-tensor helper shims.
// `_transform_bias_rescale_qkv` has three result slots (`ret0`..`ret2`).
// `_nested_tensor_from_mask_left_aligned` takes an `int32_t* ret0` slot rather
// than a tensor handle. `lengths` in `_nested_view_from_jagged_copy` is
// `AtenTensorHandle*` (presumably a nullable optional -- confirm).
// `mask_check` and `fuse_transform_0213` are `int32_t` (presumably boolean
// flags).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__transform_bias_rescale_qkv(AtenTensorHandle qkv, AtenTensorHandle qkv_bias, int64_t num_heads, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_from_mask(AtenTensorHandle t, AtenTensorHandle mask, int32_t mask_check, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_from_mask_left_aligned(AtenTensorHandle t, AtenTensorHandle mask, int32_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_from_padded(AtenTensorHandle padded, AtenTensorHandle cpu_nested_shape_example, int32_t fuse_transform_0213, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_view_from_buffer(AtenTensorHandle self, AtenTensorHandle nested_size, AtenTensorHandle nested_strides, AtenTensorHandle offsets, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_view_from_buffer_copy(AtenTensorHandle self, AtenTensorHandle nested_size, AtenTensorHandle nested_strides, AtenTensorHandle offsets, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_view_from_jagged_copy(AtenTensorHandle self, AtenTensorHandle offsets, AtenTensorHandle dummy, AtenTensorHandle* lengths, int64_t ragged_idx, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_get_values_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_compute_contiguous_strides_offsets(AtenTensorHandle nested_size, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
// Prototypes for `_trilinear` and the `unique*` family.
// Flag-like parameters (`sorted`, `return_inverse`, `return_counts`) are declared as `int32_t`.
// `unique_consecutive` takes `dim` as `int64_t*`, whereas `unique_dim*` take it by value.
// NOTE(review): the pointer form presumably marks `dim` as optional (nullable) - TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__trilinear(AtenTensorHandle i1, AtenTensorHandle i2, AtenTensorHandle i3, const int64_t* expand1, int64_t expand1_len_, const int64_t* expand2, int64_t expand2_len_, const int64_t* expand3, int64_t expand3_len_, const int64_t* sumdim, int64_t sumdim_len_, int64_t unroll_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unique(AtenTensorHandle self, int32_t sorted, int32_t return_inverse, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unique_dim(AtenTensorHandle self, int64_t dim, int32_t sorted, int32_t return_inverse, int32_t return_counts, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unique_consecutive(AtenTensorHandle self, int32_t return_inverse, int32_t return_counts, int64_t* dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unique_dim_consecutive(AtenTensorHandle self, int64_t dim, int32_t return_inverse, int32_t return_counts, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unique2(AtenTensorHandle self, int32_t sorted, int32_t return_inverse, int32_t return_counts, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
// Prototypes for `_unsafe_view`, `unsqueeze`, the `var*_correction` family, `where_self_out`
// and `_weight_norm_interface*`.
// The `*_out` declarations in this group take the destination handle `out` first and have no `ret` parameter.
// NOTE(review): in the `var*` declarations `dim` is `const int64_t**` (with `dim_len_`) and `correction`
// is `double*`; the extra indirection presumably encodes an optional argument - confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unsafe_view(AtenTensorHandle self, const int64_t* size, int64_t size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unsqueeze(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unsqueeze_(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_var_correction(AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_var_correction_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_var_mean_correction(AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_where_self_out(AtenTensorHandle out, AtenTensorHandle condition, AtenTensorHandle self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__weight_norm_interface(AtenTensorHandle v, AtenTensorHandle g, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__weight_norm_interface_backward(AtenTensorHandle grad_w, AtenTensorHandle saved_v, AtenTensorHandle saved_g, AtenTensorHandle saved_norms, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
// Prototypes for `_efficientzerotensor`, the `zeros*` family, `_standard_gamma_grad` and `_dirichlet_grad`.
// `dtype`, `layout`, `device`, `pin_memory` (and `memory_format` on `zeros_like`) are `int32_t*`,
// while `device_index_` is passed by value right after `device`.
// NOTE(review): the pointer-typed options are presumably nullable "not specified" markers - TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__efficientzerotensor(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_zeros(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_zeros_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_zeros_like(AtenTensorHandle self, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__standard_gamma_grad(AtenTensorHandle self, AtenTensorHandle output, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__dirichlet_grad(AtenTensorHandle x, AtenTensorHandle alpha, AtenTensorHandle total, AtenTensorHandle* ret0);
// Prototypes for the batch-norm entry points (`_batch_norm_with_update[_out]`,
// `_batch_norm_no_update`, `batch_norm_backward`).
// Which tensor inputs are `AtenTensorHandle*` versus plain `AtenTensorHandle` differs per function
// (e.g. `running_mean` is a plain handle in the `with_update` variants but a pointer in `no_update`).
// NOTE(review): pointer-typed tensor inputs are presumably optional tensors - confirm.
// `batch_norm_backward` takes `output_mask` as a `const int32_t*` array with `output_mask_len_`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__batch_norm_with_update(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__batch_norm_with_update_out(AtenTensorHandle out, AtenTensorHandle save_mean, AtenTensorHandle save_invstd, AtenTensorHandle reserve, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, double momentum, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__batch_norm_no_update(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_backward(AtenTensorHandle grad_out, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, AtenTensorHandle* save_mean, AtenTensorHandle* save_var, int32_t update, double eps, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle reserve, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
// Prototypes for `_sparse_sum_dim`, `_spdiags`, the `norm*` family, `resize_as_`, `zero_`,
// `rsub_Tensor` and the `addmm` variants.
// Scalar arguments such as `p`, `alpha` and `beta` are declared as `double`; several `norm*`
// declarations take `p` as `double*` instead.
// NOTE(review): `double* p` / `int32_t* layout` / `int32_t* memory_format` presumably denote optional
// values passed by nullable pointer - confirm with the generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_sum_dim(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__spdiags(AtenTensorHandle diagonals, AtenTensorHandle offsets, const int64_t* shape, int64_t shape_len_, int32_t* layout, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_norm_ScalarOpt_dtype(AtenTensorHandle self, double* p, int32_t dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_norm_Scalar(AtenTensorHandle self, double p, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_norm_dtype_out(AtenTensorHandle out, AtenTensorHandle self, double* p, const int64_t* dim, int64_t dim_len_, int32_t keepdim, int32_t dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_norm_out(AtenTensorHandle out, AtenTensorHandle self, double* p, const int64_t* dim, int64_t dim_len_, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_zero_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rsub_Tensor(AtenTensorHandle self, AtenTensorHandle other, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_addmm(AtenTensorHandle self, AtenTensorHandle mat1, AtenTensorHandle mat2, double beta, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat1, AtenTensorHandle mat2, double beta, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__addmm_activation_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat1, AtenTensorHandle mat2, double beta, double alpha, int32_t use_gelu);
// Prototypes for sparse-tensor construction (`*sparse_compressed_tensor*`, `sparse_coo_tensor_size`),
// sparse accessors (`sparse_dim`, `dense_dim`, `is_coalesced`, `indices`, `values`, `*_indices`),
// the `_to_sparse*` conversions and `to_mkldnn`.
// `sparse_dim` / `dense_dim` report through `int64_t* ret0` and `is_coalesced` through `int32_t* ret0`.
// NOTE(review): `const int64_t** blocksize` in `_to_sparse` (double pointer, unlike `_to_sparse_bsr/bsc`)
// and the `int64_t* dense_dim` / `int32_t*` option parameters presumably mark optional arguments - confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_compressed_tensor_with_dims(int64_t nnz, int64_t dense_dim, const int64_t* size, int64_t size_len_, const int64_t* blocksize, int64_t blocksize_len_, int32_t index_dtype, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_compressed_tensor_comp_plain_value_size(AtenTensorHandle compressed_indices, AtenTensorHandle plain_indices, AtenTensorHandle values, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_compressed_tensor_comp_plain_value(AtenTensorHandle compressed_indices, AtenTensorHandle plain_indices, AtenTensorHandle values, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_coo_tensor_size(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_dim(AtenTensorHandle self, int64_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dense_dim(AtenTensorHandle self, int64_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_is_coalesced(AtenTensorHandle self, int32_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_indices(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_values(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_crow_indices(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_col_indices(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ccol_indices(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_row_indices(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_sparse_dim(AtenTensorHandle self, int64_t sparse_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse(AtenTensorHandle self, int32_t* layout, const int64_t** blocksize, int64_t blocksize_len_, int64_t* dense_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_csr(AtenTensorHandle self, int64_t* dense_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_csc(AtenTensorHandle self, int64_t* dense_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_bsr(AtenTensorHandle self, const int64_t* blocksize, int64_t blocksize_len_, int64_t* dense_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_bsc(AtenTensorHandle self, const int64_t* blocksize, int64_t blocksize_len_, int64_t* dense_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_to_mkldnn(AtenTensorHandle self, int32_t* dtype, AtenTensorHandle* ret0);
// Prototypes for the quantization entry points: `quantize_per_*`, `dequantize_self`,
// `_make_per_*_quantized_tensor`, the `fake_quantize_*` family and `_fused_moving_avg_obs_fq_helper`.
// Quantization parameters appear either as scalars (`double scale`, `int64_t zero_point`) or as
// tensor handles, depending on the declaration; `dtype` is a by-value `int32_t` in this group.
// NOTE(review): the trailing `AtenTensorHandle* retN` parameters look like result slots - confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_tensor_dynamic(AtenTensorHandle self, int32_t dtype, int32_t reduce_range, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_tensor(AtenTensorHandle self, double scale, int64_t zero_point, int32_t dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_tensor_tensor_qparams(AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int32_t dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_channel(AtenTensorHandle self, AtenTensorHandle scales, AtenTensorHandle zero_points, int64_t axis, int32_t dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dequantize_self(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_per_tensor_quantized_tensor(AtenTensorHandle self, double scale, int64_t zero_point, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_per_channel_quantized_tensor(AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t axis, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fake_quantize_per_tensor_affine_cachemask(AtenTensorHandle self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_per_tensor_affine_cachemask_tensor_qparams(AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, AtenTensorHandle fake_quant_enabled, int64_t quant_min, int64_t quant_max, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_learnable_per_tensor_affine(AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t quant_min, int64_t quant_max, double grad_factor, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_learnable_per_tensor_affine_backward(AtenTensorHandle grad, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t quant_min, int64_t quant_max, double grad_factor, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fake_quantize_per_channel_affine_cachemask(AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_learnable_per_channel_affine(AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_learnable_per_channel_affine_backward(AtenTensorHandle grad, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_moving_avg_obs_fq_helper(AtenTensorHandle self, AtenTensorHandle observer_on, AtenTensorHandle fake_quant_on, AtenTensorHandle running_min, AtenTensorHandle running_max, AtenTensorHandle scale, AtenTensorHandle zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int32_t per_row_fake_quant, int32_t symmetric_quant, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
// Prototypes for `_to_copy`, `_local_scalar_dense`, `_pack_padded_sequence`, the `set_` overloads,
// the `lift*` family and `is_set_to`.
// `_local_scalar_dense` reports through `double* ret0` and `is_set_to` through `int32_t* ret0`;
// the remaining declarations use `AtenTensorHandle*` result parameters.
// NOTE(review): the `int32_t*` option parameters of `_to_copy` are presumably nullable - TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_copy(AtenTensorHandle self, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, int32_t non_blocking, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__local_scalar_dense(AtenTensorHandle self, double* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__pack_padded_sequence(AtenTensorHandle input, AtenTensorHandle lengths, int32_t batch_first, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_set__source_Tensor(AtenTensorHandle self, AtenTensorHandle source, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_set_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_lift(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_lift_fresh(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_lift_fresh_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_is_set_to(AtenTensorHandle self, AtenTensorHandle tensor, int32_t* ret0);
// Prototypes for the `masked_fill*`, `masked_scatter*` and `_masked_softmax*` entry points.
// Every declaration in this group takes a `mask` tensor handle.
// NOTE(review): `int64_t* dim` / `int64_t* mask_type` on the `_masked_softmax*` declarations are pointers,
// presumably optional integers - confirm against the generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_fill__Scalar(AtenTensorHandle self, AtenTensorHandle mask, double value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_fill__Tensor(AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_fill_Tensor(AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_scatter_(AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle source, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_scatter(AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle source, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_scatter_backward(AtenTensorHandle grad_output, AtenTensorHandle mask, const int64_t* sizes, int64_t sizes_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__masked_softmax(AtenTensorHandle self, AtenTensorHandle mask, int64_t* dim, int64_t* mask_type, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__masked_softmax_backward(AtenTensorHandle grad_output, AtenTensorHandle output, AtenTensorHandle mask, int64_t* dim, AtenTensorHandle* ret0);
// Prototypes for `view*`, `put*`, `index_add_out`, `index_reduce_out`, `index_fill*` and the
// `scatter*_out` entry points.
// The `*_out` declarations here take the destination handle `out` first and declare no `ret` parameter.
// The reduction mode is passed as a C string (`const char* reduce`); its accepted values are not
// visible in this header.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view(AtenTensorHandle self, const int64_t* size, int64_t size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_dtype(AtenTensorHandle self, int32_t dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_put_(AtenTensorHandle self, AtenTensorHandle index, AtenTensorHandle source, int32_t accumulate, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_put(AtenTensorHandle self, AtenTensorHandle index, AtenTensorHandle source, int32_t accumulate, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_add_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_reduce_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_fill__int_Scalar(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_fill_int_Scalar(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_fill__int_Tensor(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_fill_int_Tensor(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_reduce_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_value_reduce_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value, const char* reduce);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_add_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self);
// Prototypes for `__irshift__`, the `addbmm*` variants, `triu_out` / `tril_out` and
// `tril_indices` / `triu_indices`.
// The `__irshift___Scalar` overload receives its scalar operand as a `double`.
// NOTE(review): the `int32_t*` option parameters on `*_indices` are presumably nullable, with
// `device_index_` passed by value alongside `device` - confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu___irshift___Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu___irshift___Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addbmm_(AtenTensorHandle self, AtenTensorHandle batch1, AtenTensorHandle batch2, double beta, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addbmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle batch1, AtenTensorHandle batch2, double beta, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_addbmm(AtenTensorHandle self, AtenTensorHandle batch1, AtenTensorHandle batch2, double beta, double alpha, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_triu_out(AtenTensorHandle out, AtenTensorHandle self, int64_t diagonal);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_tril_out(AtenTensorHandle out, AtenTensorHandle self, int64_t diagonal);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_tril_indices(int64_t row, int64_t col, int64_t offset, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_triu_indices(int64_t row, int64_t col, int64_t offset, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
// Prototypes for `trace`, `take`, `index_select`, `masked_select`, `nonzero`, `nonzero_static`
// and `gather_out`.
// Most operations in this group are declared twice: a `*_out` form whose first parameter is the
// destination handle `out`, and a form that ends with `AtenTensorHandle* ret0` instead.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_trace(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_take_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle index);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_take(AtenTensorHandle self, AtenTensorHandle index, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_select_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_select(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_select_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_select(AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nonzero_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nonzero_static_out(AtenTensorHandle out, AtenTensorHandle self, int64_t size, int64_t fill_value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nonzero_static(AtenTensorHandle self, int64_t size, int64_t fill_value, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_gather_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, int32_t sparse_grad);
// Prototypes for the linear-algebra entry points: `triangular_solve_X`, `_linalg_check_errors`,
// `linalg_solve_triangular*`, the `cholesky*` family, `geqrf*`, `ormqr*` and `lu_unpack_out`.
// Declarations with several destination tensors list those handles first by name
// (`X, M`, `a, tau`, `P, L, U`) ahead of the inputs and declare no `ret` parameter.
// Flag-like parameters (`upper`, `left`, `transpose`, `unitriangular`, ...) are `int32_t`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_triangular_solve_X(AtenTensorHandle X, AtenTensorHandle M, AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linalg_check_errors(AtenTensorHandle info, const char* api_name, int32_t is_matrix);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_solve_triangular_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle B, int32_t upper, int32_t left, int32_t unitriangular);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_solve_triangular(AtenTensorHandle self, AtenTensorHandle B, int32_t upper, int32_t left, int32_t unitriangular, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cholesky_out(AtenTensorHandle out, AtenTensorHandle self, int32_t upper);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cholesky(AtenTensorHandle self, int32_t upper, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cholesky_solve_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle input2, int32_t upper);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cholesky_solve(AtenTensorHandle self, AtenTensorHandle input2, int32_t upper, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cholesky_solve_helper(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cholesky_inverse(AtenTensorHandle self, int32_t upper, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cholesky_inverse_out(AtenTensorHandle out, AtenTensorHandle self, int32_t upper);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_geqrf_a(AtenTensorHandle a, AtenTensorHandle tau, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_geqrf(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ormqr_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_lu_unpack_out(AtenTensorHandle P, AtenTensorHandle L, AtenTensorHandle U, AtenTensorHandle LU_data, AtenTensorHandle LU_pivots, int32_t unpack_data, int32_t unpack_pivots);
// Prototypes for `dist`, `histc*`, `histogram_*` and `_histogramdd_from_bin_*`.
// `_histogramdd_from_bin_tensors` takes its bins as an array of handles
// (`const AtenTensorHandle* bins` plus `bins_len_`).
// NOTE(review): `AtenTensorHandle* weight` and `const double** range` (with `range_len_`) carry an extra
// level of indirection compared with required arguments - presumably optional; TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dist(AtenTensorHandle self, AtenTensorHandle other, double p, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histc_out(AtenTensorHandle out, AtenTensorHandle self, int64_t bins, double min, double max);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histogram_bins_tensor_out(AtenTensorHandle hist, AtenTensorHandle bin_edges, AtenTensorHandle self, AtenTensorHandle bins, AtenTensorHandle* weight, int32_t density);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histogram_bins_tensor(AtenTensorHandle self, AtenTensorHandle bins, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histogram_bin_ct_out(AtenTensorHandle hist, AtenTensorHandle bin_edges, AtenTensorHandle self, int64_t bins, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histogram_bin_ct(AtenTensorHandle self, int64_t bins, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__histogramdd_from_bin_cts(AtenTensorHandle self, const int64_t* bins, int64_t bins_len_, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__histogramdd_from_bin_tensors(AtenTensorHandle self, const AtenTensorHandle* bins, int64_t bins_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0);
// Prototypes for `min` / `max`, the `sort*` / `argsort_stable` / `topk_values` entry points,
// `all_all_out` / `any_all_out`, `renorm_out`, `unfold*` and `alias`.
// The `*_values` declarations take two destination handles (`values`, `indices`) ahead of `self`.
// NOTE(review): `sort_values_stable` takes `stable` as `int32_t*` while `argsort_stable` takes it by
// value - the pointer presumably marks an optional flag; confirm with the generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_min(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_min_unary_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_unary_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort_values(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t dim, int32_t descending);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort_values_stable(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_argsort_stable(AtenTensorHandle self, int32_t stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_topk_values(AtenTensorHandle values, AtenTensorHandle indices, AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_all_all_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_any_all_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_renorm_out(AtenTensorHandle out, AtenTensorHandle self, double p, int64_t dim, double maxnorm);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unfold(AtenTensorHandle self, int64_t dimension, int64_t size, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unfold_backward(AtenTensorHandle grad_in, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t dim, int64_t size, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_alias(AtenTensorHandle self, AtenTensorHandle* ret0);
// Prototypes for the `_amp_*` entry points and the `_foreach_*_` family
// (add / sub / mul / div / clamp_max / clamp_min / maximum / minimum).
// Tensor lists are passed as `const AtenTensorHandle*` plus a `*_len_` count, and per-tensor scalars
// as `const double*` plus `scalars_len_`.
// Apart from `_amp_update_scale_`, the declarations in this group have no `ret` parameter.
// NOTE(review): the overload suffix (`Scalar`, `List`, `ScalarList`, `Tensor`) appears to name the type
// of the second operand; whether `self_len_` must equal `other_len_` / `scalars_len_` is not visible here.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__amp_foreach_non_finite_check_and_unscale_(const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle found_inf, AtenTensorHandle inv_scale);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__amp_update_scale_(AtenTensorHandle self, AtenTensorHandle growth_tracker, AtenTensorHandle found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add__Tensor(const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle other, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sub__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sub__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sub__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul__Tensor(const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div__Tensor(const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_max__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_max__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_max__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_min__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_min__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_min__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_maximum__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_maximum__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_maximum__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_minimum__Scalar(const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_minimum__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_minimum__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcdiv__Scalar(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcdiv__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcdiv__Tensor(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, AtenTensorHandle scalars);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcmul__Scalar(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcmul__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcmul__Tensor(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, AtenTensorHandle scalars);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_abs_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_acos_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_asin_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_atan_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_ceil_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_cos_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_cosh_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_erf_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_erfc_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_exp_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_expm1_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_floor_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_frac_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_lerp__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensors1, int64_t tensors1_len_, const AtenTensorHandle* weights, int64_t weights_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_lerp__Scalar(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensors1, int64_t tensors1_len_, double weight);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_lgamma_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log10_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log1p_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log2_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_neg_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_pow__List(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* exponent, int64_t exponent_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_pow__Scalar(const AtenTensorHandle* self, int64_t self_len_, double exponent);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_pow__ScalarList(const AtenTensorHandle* self, int64_t self_len_, const double* exponent, int64_t exponent_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_reciprocal_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_round_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sigmoid_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sign_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sin_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sinh_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sqrt_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_tan_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_tanh_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_trunc_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_zero_(const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_copy_(const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* src, int64_t src_len_, int32_t non_blocking);
// ---------------------------------------------------------------------------
// bucketize / searchsorted / sparse index conversion (CPU).
//
// Two signature shapes appear here:
//   * `*_out` variants take the tensor `out` as their first parameter and
//     have no `ret0`.
//   * the remaining variants end with `AtenTensorHandle* ret0`
//     (presumably where the result handle is stored - confirm in shim.h).
//
// `out_int32`, `right` and `transpose` are int32_t (presumably boolean flags).
// `side` (const char**) and `sorter` (AtenTensorHandle*) are passed through an
// extra level of indirection. NOTE(review): this looks like the shim's way of
// encoding optional arguments (null == absent) - verify against the generator.
// ---------------------------------------------------------------------------
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bucketize_Tensor(AtenTensorHandle self, AtenTensorHandle boundaries, int32_t out_int32, int32_t right, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bucketize_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle boundaries, int32_t out_int32, int32_t right);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bucketize_Scalar(double self, AtenTensorHandle boundaries, int32_t out_int32, int32_t right, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_searchsorted_Tensor(AtenTensorHandle sorted_sequence, AtenTensorHandle self, int32_t out_int32, int32_t right, const char** side, AtenTensorHandle* sorter, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_searchsorted_Tensor_out(AtenTensorHandle out, AtenTensorHandle sorted_sequence, AtenTensorHandle self, int32_t out_int32, int32_t right, const char** side, AtenTensorHandle* sorter);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_searchsorted_Scalar(AtenTensorHandle sorted_sequence, double self, int32_t out_int32, int32_t right, const char** side, AtenTensorHandle* sorter, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_searchsorted_Scalar_out(AtenTensorHandle out, AtenTensorHandle sorted_sequence, double self, int32_t out_int32, int32_t right, const char** side, AtenTensorHandle* sorter);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__convert_indices_from_coo_to_csr_out(AtenTensorHandle out, AtenTensorHandle self, int64_t size, int32_t out_int32);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__convert_indices_from_csr_to_coo_out(AtenTensorHandle out, AtenTensorHandle crow_indices, AtenTensorHandle col_indices, int32_t out_int32, int32_t transpose);
// ---------------------------------------------------------------------------
// Loss functions and their backward passes (CPU).
//
// Variants whose names end in `_out`, `_output` or `_grad_input` list their
// result buffer(s) first (`out`, `output` + auxiliary tensor, or `grad_input`)
// and have no `ret0`. The other variants end with one or two
// `AtenTensorHandle* retN` parameters instead.
//
// `reduction` is passed as a plain int64_t; its value encoding is not visible
// in this header. `weight` is an `AtenTensorHandle*` rather than a handle
// (presumably an optional tensor, null == absent - TODO confirm).
// ---------------------------------------------------------------------------
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mse_loss_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mse_loss_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mse_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multi_margin_loss_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle target, double p, double margin, AtenTensorHandle* weight, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multi_margin_loss(AtenTensorHandle self, AtenTensorHandle target, double p, double margin, AtenTensorHandle* weight, int64_t reduction, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multi_margin_loss_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, double p, double margin, AtenTensorHandle* weight, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multi_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, double p, double margin, AtenTensorHandle* weight, int64_t reduction, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multilabel_margin_loss_forward_output(AtenTensorHandle output, AtenTensorHandle is_target, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multilabel_margin_loss_forward(AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multilabel_margin_loss_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle is_target);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_multilabel_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle is_target, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nll_loss_forward_output(AtenTensorHandle output, AtenTensorHandle total_weight, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, int64_t ignore_index);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nll_loss_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, int64_t ignore_index, AtenTensorHandle total_weight);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nll_loss2d_forward_output(AtenTensorHandle output, AtenTensorHandle total_weight, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, int64_t ignore_index);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nll_loss2d_forward(AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, int64_t ignore_index, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nll_loss2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, int64_t ignore_index, AtenTensorHandle total_weight);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nll_loss2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, int64_t reduction, int64_t ignore_index, AtenTensorHandle total_weight, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_smooth_l1_loss_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, double beta);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_smooth_l1_loss_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, double beta);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_smooth_l1_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, double beta, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_huber_loss_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, double delta);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_huber_loss(AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, double delta, AtenTensorHandle* ret0);
// Unlike its siblings, this backward variant is suffixed `_out` (not
// `_grad_input`), although its first parameter is still named `grad_input`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_huber_loss_backward_out(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, double delta);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_huber_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, double delta, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_soft_margin_loss_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_soft_margin_loss(AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_soft_margin_loss_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0);
// ---------------------------------------------------------------------------
// Activation functions and their backward passes (CPU):
// elu, glu, hardsigmoid, hardtanh, hardswish, leaky_relu, log_sigmoid,
// rrelu_with_noise, softplus, softshrink.
//
// `*_out` / `*_output` / `*_grad_input` variants list the result tensor(s)
// first and have no `ret0`; the other variants end with
// `AtenTensorHandle* retN` parameter(s).
//
// Scalar hyper-parameters are passed as `double`; flag-like parameters
// (`is_result`, `self_is_result`, `training`) are int32_t
// (presumably booleans - confirm).
// ---------------------------------------------------------------------------
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_elu_out(AtenTensorHandle out, AtenTensorHandle self, double alpha, double scale, double input_scale);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_elu_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, double alpha, double scale, double input_scale, int32_t is_result, AtenTensorHandle self_or_result);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_glu_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_glu_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_glu_backward(AtenTensorHandle grad_output, AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_glu_jvp(AtenTensorHandle glu, AtenTensorHandle x, AtenTensorHandle dx, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_glu_backward_jvp(AtenTensorHandle grad_x, AtenTensorHandle grad_glu, AtenTensorHandle x, AtenTensorHandle dgrad_glu, AtenTensorHandle dx, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardsigmoid_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardsigmoid_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardtanh_out(AtenTensorHandle out, AtenTensorHandle self, double min_val, double max_val);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardtanh(AtenTensorHandle self, double min_val, double max_val, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardtanh_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, double min_val, double max_val);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardtanh_backward(AtenTensorHandle grad_output, AtenTensorHandle self, double min_val, double max_val, AtenTensorHandle* ret0);
// Note: the underscore-suffixed `hardtanh_` / `hardswish_` variants below still
// carry a `ret0` parameter, unlike the `_foreach_*_` declarations in this file.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardtanh_(AtenTensorHandle self, double min_val, double max_val, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardswish_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardswish(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardswish_(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardswish_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_leaky_relu_out(AtenTensorHandle out, AtenTensorHandle self, double negative_slope);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_leaky_relu_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, double negative_slope, int32_t self_is_result);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_log_sigmoid_forward_output(AtenTensorHandle output, AtenTensorHandle buffer, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_log_sigmoid_forward(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_log_sigmoid_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle buffer);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_log_sigmoid_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle buffer, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rrelu_with_noise_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle noise, double lower, double upper, int32_t training, int32_t self_is_result, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_softplus_out(AtenTensorHandle out, AtenTensorHandle self, double beta, double threshold);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_softplus_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, double beta, double threshold);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_softshrink_out(AtenTensorHandle out, AtenTensorHandle self, double lambd);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_softshrink_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, double lambd);
// ---------------------------------------------------------------------------
// Pooling and unpooling (CPU): adaptive avg/max, avg, fractional max,
// max-with-indices and max_unpool, in 2d and 3d flavours.
//
// Integer-list arguments (`output_size`, `kernel_size`, `stride`, `padding`,
// `dilation`) are each passed as a `const int64_t*` followed by a matching
// `<name>_len_` element count.
//
// `ceil_mode` / `count_include_pad` are int32_t (presumably boolean flags).
// `divisor_override` is a non-const `int64_t*` with no accompanying length.
// NOTE(review): this looks like an optional scalar (null == absent) -
// confirm against the generator.
//
// `*_out` / `*_output` / `*_grad_input` variants list the result tensor(s)
// first; the other variants end with `AtenTensorHandle* retN` parameter(s).
// ---------------------------------------------------------------------------
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_adaptive_avg_pool2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool2d(AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_adaptive_avg_pool3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool3d(AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_adaptive_avg_pool3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_adaptive_max_pool2d_out(AtenTensorHandle out, AtenTensorHandle indices, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_adaptive_max_pool2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle indices);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_adaptive_max_pool3d_out(AtenTensorHandle out, AtenTensorHandle indices, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_adaptive_max_pool3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle indices);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool2d_output(AtenTensorHandle output, AtenTensorHandle indices, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle random_samples);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle indices);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool3d_output(AtenTensorHandle output, AtenTensorHandle indices, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle random_samples);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle indices);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle indices, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_pool2d_with_indices_out(AtenTensorHandle out, AtenTensorHandle indices, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_pool2d_with_indices_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle indices);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_pool3d_with_indices_out(AtenTensorHandle out, AtenTensorHandle indices, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_pool3d_with_indices(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_pool3d_with_indices_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle indices);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_pool3d_with_indices_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle indices, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_unpool2d_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle indices, const int64_t* output_size, int64_t output_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_unpool2d(AtenTensorHandle self, AtenTensorHandle indices, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_unpool3d_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle indices, const int64_t* output_size, int64_t output_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_unpool3d(AtenTensorHandle self, AtenTensorHandle indices, const int64_t* output_size, int64_t output_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0);
// ---------------------------------------------------------------------------
// Reflection and replication padding (CPU), 1d / 2d / 3d.
//
// Every function takes the padding amounts as a `const int64_t* padding`
// array with its element count in `padding_len_`.
//
// `*_out` and `*_backward_grad_input` variants list the result tensor first
// (`out` / `grad_input`); the other variants end with
// `AtenTensorHandle* ret0`.
// ---------------------------------------------------------------------------
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad1d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad1d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad2d(AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_reflection_pad3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad1d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad1d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0);
// Interpolating upsample CPU shim declarations: linear1d, bilinear2d,
// bicubic2d, trilinear3d, plus the `_aa`-suffixed bilinear/bicubic variants.
// Each op comes as a pair:
// - `*_out`: writes into `out`; takes `output_size` + `output_size_len_`.
// - `*_backward_grad_input`: writes into `grad_input`; additionally takes
//   `input_size` + `input_size_len_`.
// `align_corners` is passed as int32_t. The per-dimension scale factors
// (`scales`, `scales_d`, `scales_h`, `scales_w`) are passed as `double*`.
// NOTE(review): the pointer form presumably encodes an optional value
// (null == not provided) -- confirm against the shim implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_linear1d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, int32_t align_corners, double* scales);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_linear1d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_bilinear2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_bilinear2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_bilinear2d_aa_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_bilinear2d_aa_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_bicubic2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_bicubic2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_bicubic2d_aa_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_bicubic2d_aa_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_trilinear3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, int32_t align_corners, double* scales_d, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_trilinear3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_d, double* scales_h, double* scales_w);
// Nearest-neighbour upsample CPU shim declarations (1d / 2d / 3d), each in a
// plain `upsample_nearestNd` and an `_upsample_nearest_exactNd` flavour.
// Unlike the interpolating upsample declarations, these take no
// `align_corners` parameter.
// - `*_out`: writes into `out`; takes `output_size` + `output_size_len_`.
// - `*_backward_grad_input`: writes into `grad_input`; additionally takes
//   `input_size` + `input_size_len_`.
// Scale factors are passed as `double*`.
// NOTE(review): presumably null means "not provided" -- confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_nearest1d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, double* scales);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_nearest_exact1d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, double* scales);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_nearest1d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, double* scales);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_nearest_exact1d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, double* scales);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_nearest2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_nearest_exact2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_nearest2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_nearest_exact2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_nearest3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, double* scales_d, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_nearest_exact3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, double* scales_d, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_nearest3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, double* scales_d, double* scales_h, double* scales_w);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__upsample_nearest_exact3d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, double* scales_d, double* scales_h, double* scales_w);
// `slow_conv*` CPU shim declarations: transposed 2d/3d, `_slow_conv2d`
// forward/backward, `slow_conv3d` forward, and dilated 2d/3d.
// Integer-list arguments (`kernel_size`, `stride`, `padding`,
// `output_padding`, `dilation`) are each passed as a pointer followed by a
// matching `<name>_len_` element count.
// `bias` is declared as `AtenTensorHandle*` rather than a handle by value.
// NOTE(review): presumably this marks an optional tensor (null == absent) --
// confirm against the shim implementation.
// `_slow_conv2d_backward_output_mask` takes `output_mask` as an int32_t array
// with `output_mask_len_` and has three output parameters `ret0..ret2`;
// `_slow_conv2d_backward_grad_input` instead takes three destination handles
// (`grad_input`, `grad_weight`, `grad_bias`) as its leading parameters.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv_transpose2d_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* output_padding, int64_t output_padding_len_, const int64_t* dilation, int64_t dilation_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv_transpose3d_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* output_padding, int64_t output_padding_len_, const int64_t* dilation, int64_t dilation_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv_transpose3d(AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* output_padding, int64_t output_padding_len_, const int64_t* dilation, int64_t dilation_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__slow_conv2d_forward_output(AtenTensorHandle output, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__slow_conv2d_forward(AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__slow_conv2d_backward_grad_input(AtenTensorHandle grad_input, AtenTensorHandle grad_weight, AtenTensorHandle grad_bias, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__slow_conv2d_backward_output_mask(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv3d_forward_output(AtenTensorHandle output, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv3d_forward(AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv_dilated2d(AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv_dilated3d(AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, AtenTensorHandle* ret0);
// `col2im` / `im2col` CPU shim declarations, each as an `_out` variant (leading
// destination handle `out`) and a variant with a trailing
// `AtenTensorHandle* ret0` output parameter.
// Both take `kernel_size`, `dilation`, `padding` and `stride` as
// pointer + `<name>_len_` pairs; `col2im` additionally takes `output_size`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_col2im_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* dilation, int64_t dilation_len_, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_col2im(AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* dilation, int64_t dilation_len_, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_im2col_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* dilation, int64_t dilation_len_, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_im2col(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* dilation, int64_t dilation_len_, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, AtenTensorHandle* ret0);
// `fft_fftfreq` / `fft_rfftfreq` CPU shim declarations.
// The factory-style variants take `dtype`, `layout`, `device` and `pin_memory`
// as `int32_t*`, plus a by-value `device_index_`, and have a trailing `ret0`
// output parameter.
// NOTE(review): the pointer-typed options presumably encode optional values
// (null == unspecified) -- confirm.
// The `_out` variants take only the destination handle `out`, `n` and `d`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fft_fftfreq(int64_t n, double d, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fft_fftfreq_out(AtenTensorHandle out, int64_t n, double d);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fft_rfftfreq(int64_t n, double d, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fft_rfftfreq_out(AtenTensorHandle out, int64_t n, double d);
// `linalg_*` CPU shim declarations, part 1: cholesky_ex, cross, LU
// (factor_ex / lu / lu_solve), det, LDL (factor_ex / solve), lstsq,
// matrix_exp and slogdet.
// Most entries take their destination handles as leading parameters (e.g.
// `L, info`, `LU, pivots, info`, `P, L, U`). `linalg_lstsq` and
// `linalg_matrix_exp` instead have trailing `AtenTensorHandle* retN` output
// parameters.
// Boolean-like flags (`upper`, `check_errors`, `pivot`, `left`, `adjoint`,
// `hermitian`) are passed as int32_t.
// `rcond` (`double*`) and `driver` (`const char**`) on lstsq are passed by
// pointer.
// NOTE(review): presumably because they are optional -- confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_cholesky_ex_L(AtenTensorHandle L, AtenTensorHandle info, AtenTensorHandle self, int32_t upper, int32_t check_errors);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_cross_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_lu_factor_ex_out(AtenTensorHandle LU, AtenTensorHandle pivots, AtenTensorHandle info, AtenTensorHandle A, int32_t pivot, int32_t check_errors);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_lu_out(AtenTensorHandle P, AtenTensorHandle L, AtenTensorHandle U, AtenTensorHandle A, int32_t pivot);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_lu_solve_out(AtenTensorHandle out, AtenTensorHandle LU, AtenTensorHandle pivots, AtenTensorHandle B, int32_t left, int32_t adjoint);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linalg_det_result(AtenTensorHandle result, AtenTensorHandle LU, AtenTensorHandle pivots, AtenTensorHandle A);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_ldl_factor_ex_out(AtenTensorHandle LD, AtenTensorHandle pivots, AtenTensorHandle info, AtenTensorHandle self, int32_t hermitian, int32_t check_errors);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_ldl_solve_out(AtenTensorHandle out, AtenTensorHandle LD, AtenTensorHandle pivots, AtenTensorHandle B, int32_t hermitian);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_lstsq(AtenTensorHandle self, AtenTensorHandle b, double* rcond, const char** driver, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_lstsq_out(AtenTensorHandle solution, AtenTensorHandle residuals, AtenTensorHandle rank, AtenTensorHandle singular_values, AtenTensorHandle self, AtenTensorHandle b, double* rcond, const char** driver);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_matrix_exp(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linalg_slogdet_sign(AtenTensorHandle sign, AtenTensorHandle logabsdet, AtenTensorHandle LU, AtenTensorHandle pivots, AtenTensorHandle A);
// `linalg_*` CPU shim declarations, part 2: eig / eigvals / eigh,
// householder_product, inv_ex, vector_norm, svd, pinv, solve_ex and qr.
// String-valued options are passed as `const char*` (`UPLO`, `mode`) or, for
// `driver` on `_linalg_svd_U`, as `const char**`.
// `linalg_vector_norm_out` takes `dim` as `const int64_t**` with `dim_len_`
// and `dtype` as `int32_t*`; `linalg_pinv_atol_rtol_tensor*` take `atol` and
// `rtol` as `AtenTensorHandle*`.
// NOTE(review): the extra level of indirection presumably marks these
// arguments as optional -- confirm against the shim implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_eig(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_eig_out(AtenTensorHandle eigenvalues, AtenTensorHandle eigenvectors, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linalg_eigvals(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_eigvals_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linalg_eigh_eigenvalues(AtenTensorHandle eigenvalues, AtenTensorHandle eigenvectors, AtenTensorHandle A, const char* UPLO, int32_t compute_v);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_householder_product(AtenTensorHandle input, AtenTensorHandle tau, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_householder_product_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle tau);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_inv_ex_inverse(AtenTensorHandle inverse, AtenTensorHandle info, AtenTensorHandle A, int32_t check_errors);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_vector_norm_out(AtenTensorHandle out, AtenTensorHandle self, double ord, const int64_t** dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linalg_svd_U(AtenTensorHandle U, AtenTensorHandle S, AtenTensorHandle Vh, AtenTensorHandle A, int32_t full_matrices, int32_t compute_uv, const char** driver);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_pinv_atol_rtol_tensor(AtenTensorHandle self, AtenTensorHandle* atol, AtenTensorHandle* rtol, int32_t hermitian, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_pinv_atol_rtol_tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle* atol, AtenTensorHandle* rtol, int32_t hermitian);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linalg_solve_ex_result(AtenTensorHandle result, AtenTensorHandle LU, AtenTensorHandle pivots, AtenTensorHandle info, AtenTensorHandle A, AtenTensorHandle B, int32_t left, int32_t check_errors);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_qr_out(AtenTensorHandle Q, AtenTensorHandle R, AtenTensorHandle A, const char* mode);
// CPU shim declarations for operators whose names carry a `_test_` prefix.
// NOTE(review): presumably test-only operators -- confirm before relying on
// them in production code.
// All of them have a trailing `AtenTensorHandle* ret0` output parameter.
// The `_test_optional_*list` entries take `addends` as a pointer-to-pointer
// (`const int64_t**` / `const double**`) together with `addends_len_`.
// NOTE(review): the double indirection presumably lets the whole list be
// absent -- confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_parallel_materialize(AtenTensorHandle self, int64_t num_parallel, int32_t skip_first, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_optional_intlist(AtenTensorHandle values, const int64_t** addends, int64_t addends_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_optional_filled_intlist(AtenTensorHandle values, const int64_t** addends, int64_t addends_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_optional_floatlist(AtenTensorHandle values, const double** addends, int64_t addends_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_warn_in_autograd(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_autograd_multiple_dispatch_fullcoverage(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_autograd_multiple_dispatch_view(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_autograd_multiple_dispatch_view_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
// `segment_reduce` (forward and backward) and
// `_nested_tensor_from_tensor_list` CPU shim declarations.
// `reduce` is a C string. `lengths`, `indices`, `offsets` are
// `AtenTensorHandle*` and `initial` is `double*`.
// NOTE(review): these pointer-typed arguments presumably encode optional
// values -- confirm.
// `_nested_tensor_from_tensor_list` takes its inputs as an array of handles
// (`list` + `list_len_`) plus pointer-typed `dtype` / `layout` / `device` /
// `pin_memory` options and a by-value `device_index_`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__segment_reduce_backward(AtenTensorHandle grad, AtenTensorHandle output, AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* offsets, int64_t axis, double* initial, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_from_tensor_list(const AtenTensorHandle* list, int64_t list_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0);
// `*_copy` CPU shim declarations, part 1 (`_fw_primal_copy` through
// `unsqueeze_copy`).
// Every entry takes the input handle first and has a trailing
// `AtenTensorHandle* ret0` output parameter.
// Size/stride/dim lists are passed as pointer + `<name>_len_` pairs.
// `storage_offset` (as_strided_copy) and `start` / `end` (slice_copy_Tensor)
// are `int64_t*`.
// NOTE(review): presumably optional integers (null == not provided) --
// confirm against the shim implementation.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fw_primal_copy(AtenTensorHandle self, int64_t level, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_dual_copy(AtenTensorHandle primal, AtenTensorHandle tangent, int64_t level, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_as_real_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_as_complex_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__conj_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__neg_view_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_as_strided_copy(AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int64_t* storage_offset, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_broadcast_to_copy(AtenTensorHandle self, const int64_t* size, int64_t size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diagonal_copy(AtenTensorHandle self, int64_t offset, int64_t dim1, int64_t dim2, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_expand_copy(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t implicit, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_permute_copy(AtenTensorHandle self, const int64_t* dims, int64_t dims_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__reshape_alias_copy(AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_select_copy_int(AtenTensorHandle self, int64_t dim, int64_t index, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_detach_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_copy_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_copy_dim(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_copy_dims(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_t_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_transpose_copy_int(AtenTensorHandle self, int64_t dim0, int64_t dim1, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unsqueeze_copy(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0);
// `*_copy` CPU shim declarations for index/value accessors: `_indices`,
// `_values`, `indices`, `values`, `crow_indices`, `col_indices`,
// `ccol_indices` and `row_indices`.
// All share one signature: an input handle `self` and a trailing
// `AtenTensorHandle* ret0` output parameter.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__indices_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__values_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_indices_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_values_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_crow_indices_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_col_indices_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ccol_indices_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_row_indices_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
// Remaining `*_copy` CPU shim declarations.
// The three `*_copy*_out` entries (unbind, split, split_with_sizes) take their
// destinations as an array of handles: `const AtenTensorHandle* out` plus
// `out_len_`. The other entries (view_copy, view_copy_dtype, unfold_copy,
// alias_copy) have a trailing `AtenTensorHandle* ret0` output parameter.
// `view_copy_dtype` takes `dtype` as a plain int32_t by value.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unbind_copy_int_out(const AtenTensorHandle* out, int64_t out_len_, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_split_copy_Tensor_out(const AtenTensorHandle* out, int64_t out_len_, AtenTensorHandle self, int64_t split_size, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_split_with_sizes_copy_out(const AtenTensorHandle* out, int64_t out_len_, AtenTensorHandle self, const int64_t* split_sizes, int64_t split_sizes_len_, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_copy(AtenTensorHandle self, const int64_t* size, int64_t size_len_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_copy_dtype(AtenTensorHandle self, int32_t dtype, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unfold_copy(AtenTensorHandle self, int64_t dimension, int64_t size, int64_t step, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_alias_copy(AtenTensorHandle self, AtenTensorHandle* ret0);
// Transformer / attention CPU shim declarations, followed by `_foobar`.
// `mask` / `attn_mask` are `AtenTensorHandle*`, `mask_type` is `int64_t*` and
// `scale` is `double*`.
// NOTE(review): presumably optional arguments (null == absent) -- confirm.
// `_fused_sdp_choice` is the only entry here whose output parameter `ret0` is
// an `int64_t*` rather than an `AtenTensorHandle*`.
// `_scaled_dot_product_flash_attention_for_cpu` has two tensor output
// parameters; its `_backward` counterpart has three.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__transformer_encoder_layer_fwd(AtenTensorHandle src, int64_t embed_dim, int64_t num_heads, AtenTensorHandle qkv_weight, AtenTensorHandle qkv_bias, AtenTensorHandle proj_weight, AtenTensorHandle proj_bias, int32_t use_gelu, int32_t norm_first, double eps, AtenTensorHandle norm_weight_1, AtenTensorHandle norm_bias_1, AtenTensorHandle norm_weight_2, AtenTensorHandle norm_bias_2, AtenTensorHandle ffn_weight_1, AtenTensorHandle ffn_bias_1, AtenTensorHandle ffn_weight_2, AtenTensorHandle ffn_bias_2, AtenTensorHandle* mask, int64_t* mask_type, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_multi_head_attention(AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, int64_t embed_dim, int64_t num_head, AtenTensorHandle qkv_weight, AtenTensorHandle qkv_bias, AtenTensorHandle proj_weight, AtenTensorHandle proj_bias, AtenTensorHandle* mask, int32_t need_weights, int32_t average_attn_weights, int64_t* mask_type, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_sdp_choice(AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle* attn_mask, double dropout_p, int32_t is_causal, double* scale, int64_t* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__scaled_dot_product_flash_attention_for_cpu(AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, double dropout_p, int32_t is_causal, AtenTensorHandle* attn_mask, double* scale, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__scaled_dot_product_flash_attention_for_cpu_backward(AtenTensorHandle grad_out, AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle out, AtenTensorHandle logsumexp, double dropout_p, int32_t is_causal, AtenTensorHandle* attn_mask, double* scale, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foobar(AtenTensorHandle self, int32_t arg1, int32_t arg2, int32_t arg3, AtenTensorHandle* ret0);
// `*_out` CPU shim declarations: `_new_zeros_with_same_feature_meta`, the
// `_cudnn_*` family, `_masked_scale` and `native_dropout` (+ backward).
// Destination handles come first (`out`, or `out0..outN` for multi-output
// ops; `_cudnn_rnn_backward_out` also takes an array destination
// `out3` + `out3_len_`).
// Several tensor arguments are `AtenTensorHandle*` (`weight_buf` on
// `_cudnn_rnn_out`, `cx`, `dropout_state`, `grad_output`, `grad_hy`,
// `grad_cy`), and `train` on `native_dropout_out` is `int32_t*`.
// NOTE(review): presumably optional values -- confirm. Note that `weight_buf`
// is a pointer on `_cudnn_rnn_out` but a plain handle on
// `_cudnn_rnn_backward_out`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__new_zeros_with_same_feature_meta_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other, int64_t self_num_batch_dims);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cudnn_ctc_loss_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle log_probs, AtenTensorHandle targets, const int64_t* input_lengths, int64_t input_lengths_len_, const int64_t* target_lengths, int64_t target_lengths_len_, int64_t blank, int32_t deterministic, int32_t zero_infinity);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cudnn_rnn_flatten_weight_out(AtenTensorHandle out, const AtenTensorHandle* weight_arr, int64_t weight_arr_len_, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, int32_t batch_first, int32_t bidirectional);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cudnn_rnn_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle out4, AtenTensorHandle input, const AtenTensorHandle* weight, int64_t weight_len_, int64_t weight_stride0, AtenTensorHandle* weight_buf, AtenTensorHandle hx, AtenTensorHandle* cx, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, int32_t batch_first, double dropout, int32_t train, int32_t bidirectional, const int64_t* batch_sizes, int64_t batch_sizes_len_, AtenTensorHandle* dropout_state);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cudnn_rnn_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, const AtenTensorHandle* out3, int64_t out3_len_, AtenTensorHandle input, const AtenTensorHandle* weight, int64_t weight_len_, int64_t weight_stride0, AtenTensorHandle weight_buf, AtenTensorHandle hx, AtenTensorHandle* cx, AtenTensorHandle output, AtenTensorHandle* grad_output, AtenTensorHandle* grad_hy, AtenTensorHandle* grad_cy, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, int32_t batch_first, double dropout, int32_t train, int32_t bidirectional, const int64_t* batch_sizes, int64_t batch_sizes_len_, AtenTensorHandle* dropout_state, AtenTensorHandle reserve, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cudnn_init_dropout_state_out(AtenTensorHandle out, double dropout, int32_t train, int64_t dropout_seed);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__masked_scale_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask, double scale);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_dropout_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle input, double p, int32_t* train);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_dropout_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle mask, double scale);
// `*_out` CPU shim declarations (`_conj_physical` through `constant_pad_nd`):
// elementwise scalar adds, affine grid generation, window functions,
// quantized batch norm, BCE-with-logits, bincount, block_diag and constant
// padding.
// Each takes the destination handle `out` as its first parameter.
// Scalar operands (`other`, `alpha`, `value`) are passed as double.
// `weight`, `bias`, `pos_weight` and `weights` are `AtenTensorHandle*`.
// NOTE(review): presumably optional tensors (null == absent) -- confirm.
// `block_diag_out` takes its inputs as an array: `tensors` + `tensors_len_`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__conj_physical_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__add_relu_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_add_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_affine_grid_generator_out(AtenTensorHandle out, AtenTensorHandle theta, const int64_t* size, int64_t size_len_, int32_t align_corners);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_functorch_fallback_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bartlett_window_out(AtenTensorHandle out, int64_t window_length);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bartlett_window_periodic_out(AtenTensorHandle out, int64_t window_length, int32_t periodic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantized_batch_norm_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle mean, AtenTensorHandle var, double eps, double output_scale, int64_t output_zero_point);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_binary_cross_entropy_with_logits_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle target, AtenTensorHandle* weight, AtenTensorHandle* pos_weight, int64_t reduction);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bincount_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle* weights, int64_t minlength);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_blackman_window_out(AtenTensorHandle out, int64_t window_length);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_blackman_window_periodic_out(AtenTensorHandle out, int64_t window_length, int32_t periodic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_block_diag_out(AtenTensorHandle out, const AtenTensorHandle* tensors, int64_t tensors_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_constant_pad_nd_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* pad, int64_t pad_len_, double value);
// `convolution*_out` CPU shim declarations: forward (`convolution`,
// `convolution_overrideable`, `_convolution`) and backward
// (`convolution_backward`, `convolution_backward_overrideable`).
// Forward variants write into `out`; backward variants write into
// `out0, out1, out2` and take an `output_mask` int32_t array with
// `output_mask_len_`.
// `stride`, `padding`, `dilation` and `output_padding` are pointer +
// `<name>_len_` pairs. `bias` is `AtenTensorHandle*` and `bias_sizes` on
// `convolution_backward_out` is `const int64_t**`.
// NOTE(review): presumably optional arguments -- confirm.
// `_convolution_out` additionally takes the int32_t flags `benchmark`,
// `deterministic`, `cudnn_enabled` and `allow_tf32`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle weight, const int64_t** bias_sizes, int64_t bias_sizes_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution_overrideable_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_convolution_backward_overrideable_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle weight, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__convolution_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, int32_t benchmark, int32_t deterministic, int32_t cudnn_enabled, int32_t allow_tf32);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_conv_tbc_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle bias, int64_t pad);
// Copy family, out variants: copy, _copy_from, _copy_from_and_resize.
// `non_blocking` is carried as an int32_t (presumably a boolean flag).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_copy_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle src, int32_t non_blocking);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__copy_from_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle dst, int32_t non_blocking);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__copy_from_and_resize_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle dst);
// count_nonzero, out variants. The dim_IntList overload takes the dims as an int64
// array plus `dim_len_`; the other overload takes a single `int64_t* dim`
// (presumably nullable to mean "no dim" -- TODO confirm).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_count_nonzero_dim_IntList_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_count_nonzero_out(AtenTensorHandle out, AtenTensorHandle self, int64_t* dim);
// cudnn_* ops and the mps transposed-convolution pair, out variants.
// NOTE(review): these backend-specific operator names are declared under the
// aoti_torch_cpu_ prefix; confirm whether CPU implementations exist or whether the
// generator emits these declarations for every operator regardless of backend.
// Tensor parameters typed `AtenTensorHandle*` (bias, running_mean, running_var,
// save_mean, save_var) and the `double* alpha` scalar are passed by pointer,
// presumably so they can be null when absent -- TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_affine_grid_generator_out(AtenTensorHandle out, AtenTensorHandle theta, int64_t N, int64_t C, int64_t H, int64_t W);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_affine_grid_generator_backward_out(AtenTensorHandle out, AtenTensorHandle grad, int64_t N, int64_t C, int64_t H, int64_t W);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_batch_norm_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, int32_t training, double exponential_average_factor, double epsilon);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_batch_norm_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle input, AtenTensorHandle grad_output, AtenTensorHandle weight, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, AtenTensorHandle* save_mean, AtenTensorHandle* save_var, double epsilon, AtenTensorHandle reserveSpace);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_convolution_transpose_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* padding, int64_t padding_len_, const int64_t* output_padding, int64_t output_padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, int32_t benchmark, int32_t deterministic, int32_t allow_tf32);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__mps_convolution_transpose_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* padding, int64_t padding_len_, const int64_t* output_padding, int64_t output_padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mps_convolution_transpose_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, AtenTensorHandle grad_output, AtenTensorHandle weight, const int64_t* padding, int64_t padding_len_, const int64_t* output_padding, int64_t output_padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_convolution_relu_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_convolution_add_relu_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle z, double* alpha, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_grid_sampler_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle grid);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cudnn_grid_sampler_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, AtenTensorHandle grid, AtenTensorHandle grad_output);
// _ctc_loss forward/backward, out variants. The plain overload and the backward take
// input_lengths/target_lengths as int64 arrays with `_len_` arguments; the Tensor
// overload takes them as tensor handles instead.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__ctc_loss_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle log_probs, AtenTensorHandle targets, const int64_t* input_lengths, int64_t input_lengths_len_, const int64_t* target_lengths, int64_t target_lengths_len_, int64_t blank, int32_t zero_infinity);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__ctc_loss_Tensor_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle log_probs, AtenTensorHandle targets, AtenTensorHandle input_lengths, AtenTensorHandle target_lengths, int64_t blank, int32_t zero_infinity);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__ctc_loss_backward_out(AtenTensorHandle out, AtenTensorHandle grad, AtenTensorHandle log_probs, AtenTensorHandle targets, const int64_t* input_lengths, int64_t input_lengths_len_, const int64_t* target_lengths, int64_t target_lengths_len_, AtenTensorHandle neg_log_likelihood, AtenTensorHandle log_alpha, int64_t blank, int32_t zero_infinity);
// diag_embed and diagonal_backward, out variants. Both take offset/dim1/dim2 as int64;
// diagonal_backward additionally takes `input_sizes` as an int64 array plus length.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diag_embed_out(AtenTensorHandle out, AtenTensorHandle self, int64_t offset, int64_t dim1, int64_t dim2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diagonal_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t offset, int64_t dim1, int64_t dim2);
// div with a scalar divisor, out variants. The divisor crosses the C boundary as a
// double. `rounding_mode` is a `const char**` (presumably a nullable pointer to a
// C string, i.e. an optional string -- TODO confirm).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_div_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_div_Scalar_mode_out(AtenTensorHandle out, AtenTensorHandle self, double other, const char** rounding_mode);
// Embedding and embedding_bag family. All are out variants except embedding_renorm,
// which has no `out` handle and instead hands its result back through `ret0`.
// The embedding_bag forward variants take four output handles; `per_sample_weights`
// is an `AtenTensorHandle*` (presumably nullable when no weights are given).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_embedding_out(AtenTensorHandle out, AtenTensorHandle weight, AtenTensorHandle indices, int64_t padding_idx, int32_t scale_grad_by_freq, int32_t sparse);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_embedding_dense_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle indices, int64_t num_weights, int64_t padding_idx, int32_t scale_grad_by_freq);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_embedding_renorm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle indices, double max_norm, double norm_type);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_embedding_renorm(AtenTensorHandle self, AtenTensorHandle indices, double max_norm, double norm_type, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag_forward_only_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, int32_t scale_grad_by_freq, int64_t mode, int32_t sparse, AtenTensorHandle* per_sample_weights, int32_t include_last_offset, int64_t padding_idx);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, int32_t scale_grad_by_freq, int64_t mode, int32_t sparse, AtenTensorHandle* per_sample_weights, int32_t include_last_offset, int64_t padding_idx);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag_dense_backward_out(AtenTensorHandle out, AtenTensorHandle grad, AtenTensorHandle indices, AtenTensorHandle offset2bag, AtenTensorHandle bag_size, AtenTensorHandle maximum_indices, int64_t num_weights, int32_t scale_grad_by_freq, int64_t mode, AtenTensorHandle* per_sample_weights, int64_t padding_idx);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__embedding_bag_per_sample_weights_backward_out(AtenTensorHandle out, AtenTensorHandle grad, AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, AtenTensorHandle offset2bag, int64_t mode, int64_t padding_idx);
// Tensor construction and resizing: empty_*, new_*, quantized empties, resize, _resize_output.
// Sizes and strides are int64 arrays each followed by a `_len_` argument.
// `memory_format` is an `int32_t*` (presumably a nullable pointer to an enum code --
// TODO confirm). `resize` and `_resize_output` are declared twice: once with a leading
// `out` handle and once returning the result through `ret0`.
// `_resize_output` takes the device as a (`device`, `device_index_`) pair of int32 values.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_permuted_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_, const int64_t* physical_layout, int64_t physical_layout_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_empty_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_empty_strided_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_full_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, double fill_value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_zeros_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_new_ones_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__empty_affine_quantized_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_, double scale, int64_t zero_point, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__empty_per_channel_affine_quantized_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_, AtenTensorHandle scales, AtenTensorHandle zero_points, int64_t axis, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__resize_output_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t device, int32_t device_index_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__resize_output(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t device, int32_t device_index_, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_quantized_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_, AtenTensorHandle qtensor, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_like_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_empty_strided_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_);
// fill (Scalar and Tensor overloads), floor_divide by scalar, full_like and from_file,
// out variants. Scalar values are passed as double. from_file takes `shared` and
// `size` by pointer (presumably nullable optionals -- TODO confirm) and the file
// name as a C string.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fill_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fill_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_floor_divide_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_full_like_out(AtenTensorHandle out, AtenTensorHandle self, double fill_value, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_from_file_out(AtenTensorHandle out, const char* filename, int32_t* shared, int64_t* size);
// grid_sampler 2d/3d forward and backward, out variants. interpolation_mode and
// padding_mode are passed as int64 codes and align_corners as int32. The backward
// variants take two output handles and an int32 `output_mask` array with its length.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_2d_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_2d_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__grid_sampler_2d_cpu_fallback_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_3d_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_3d_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_);
// Window functions (hann, hamming, kaiser), out variants. Each overload adds
// parameters to the previous one: window_length only, then `periodic` (int32),
// then the shape parameters (alpha and/or beta) as doubles.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hann_window_out(AtenTensorHandle out, int64_t window_length);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hann_window_periodic_out(AtenTensorHandle out, int64_t window_length, int32_t periodic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window_out(AtenTensorHandle out, int64_t window_length);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window_periodic_out(AtenTensorHandle out, int64_t window_length, int32_t periodic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window_periodic_alpha_out(AtenTensorHandle out, int64_t window_length, int32_t periodic, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hamming_window_periodic_alpha_beta_out(AtenTensorHandle out, int64_t window_length, int32_t periodic, double alpha, double beta);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kaiser_window_out(AtenTensorHandle out, int64_t window_length);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kaiser_window_periodic_out(AtenTensorHandle out, int64_t window_length, int32_t periodic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kaiser_window_beta_out(AtenTensorHandle out, int64_t window_length, int32_t periodic, double beta);
// native_group_norm forward/backward, out variants with three output handles each.
// `weight`/`bias` are `AtenTensorHandle*` (presumably nullable when not supplied);
// N, C, HxW and group are passed as int64 values.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_group_norm_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_group_norm_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle grad_out, AtenTensorHandle input, AtenTensorHandle mean, AtenTensorHandle rstd, AtenTensorHandle* weight, int64_t N, int64_t C, int64_t HxW, int64_t group, const int32_t* output_mask, int64_t output_mask_len_);
// index_put and _index_put_impl. `indices` is a `const AtenTensorHandle**` with
// `indices_len_` entries (presumably an array of nullable handle pointers, so that
// individual indices can be omitted -- TODO confirm). _index_put_impl is declared both
// with a leading `out` handle and in a form that returns the result through `ret0`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_put_out(AtenTensorHandle out, AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__index_put_impl_out(AtenTensorHandle out, AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, int32_t unsafe);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__index_put_impl(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, int32_t unsafe, AtenTensorHandle* ret0);
// isnan, out variant: one output handle and one input handle.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_isnan_out(AtenTensorHandle out, AtenTensorHandle self);
// native_layer_norm forward/backward, out variants with three output handles each.
// `normalized_shape` is an int64 array plus length; `weight`/`bias` are
// `AtenTensorHandle*` (presumably nullable when not supplied).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_layer_norm_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle input, const int64_t* normalized_shape, int64_t normalized_shape_len_, AtenTensorHandle* weight, AtenTensorHandle* bias, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_layer_norm_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle grad_out, AtenTensorHandle input, const int64_t* normalized_shape, int64_t normalized_shape_len_, AtenTensorHandle mean, AtenTensorHandle rstd, AtenTensorHandle* weight, AtenTensorHandle* bias, const int32_t* output_mask, int64_t output_mask_len_);
// linear / mkldnn_linear / matmul backward passes plus the mkldnn_linear forward,
// out variants. The backward passes that take a mask receive it as an int32 array
// with its length (`output_mask` or `mask`).
// NOTE(review): mkldnn_linear_backward_weights takes `bias_defined` as a plain int32
// flag rather than a mask array.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linear_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle self, AtenTensorHandle grad_output, AtenTensorHandle weight, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_linear_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_linear_backward_input_out(AtenTensorHandle out, const int64_t* input_size, int64_t input_size_len_, AtenTensorHandle grad_output, AtenTensorHandle weight);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_linear_backward_weights_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle weight, int32_t bias_defined);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_linear_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle self, AtenTensorHandle grad_output, AtenTensorHandle weight, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_matmul_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle grad, AtenTensorHandle self, AtenTensorHandle other, const int32_t* mask, int64_t mask_len_);
// _aminmax, out variants with two output handles: one over the whole tensor and
// a `dim` overload that also takes `keepdim` as an int32.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__aminmax_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__aminmax_dim_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, int64_t dim, int32_t keepdim);
// Max-pooling family (generic backward, mkldnn 2d/3d, quantized 1d/2d/3d), out variants.
// All share the same tail: kernel_size, stride, padding and dilation as int64 arrays
// each followed by a `_len_` argument, then `ceil_mode` as an int32.
// The mkldnn backward variants additionally take the forward `output` and `input` handles.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_max_pool2d_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_max_pool2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_max_pool2d_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle output, AtenTensorHandle input, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_max_pool3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_max_pool3d_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle output, AtenTensorHandle input, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantized_max_pool1d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantized_max_pool2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantized_max_pool3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode);
// median and nanmedian, out variants: one output handle and one input handle each.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_median_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nanmedian_out(AtenTensorHandle out, AtenTensorHandle self);
// mps and mkldnn convolution, out variants. Note the list order here is
// padding, stride, dilation (each an int64 array plus length), followed by `groups`.
// NOTE(review): the mps_* names are declared under the aoti_torch_cpu_ prefix; confirm
// whether the generator emits them for every operator regardless of backend.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__mps_convolution_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mps_convolution_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle self, AtenTensorHandle grad_output, AtenTensorHandle weight, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_convolution_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups);
// mkldnn_rnn_layer forward (four output handles) and backward (seven output handles).
// NOTE(review): the weight parameters are named weight0..weight3 in the forward
// declaration but weight1..weight4 in the backward one; the flag order also differs
// (forward: has_biases, bidirectional, batch_first, train; backward: has_biases,
// train, bidirectional, ..., batch_first). Callers must match each order exactly.
// grad_output / grad_hy / grad_cy are `AtenTensorHandle*` (presumably nullable).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_rnn_layer_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle input, AtenTensorHandle weight0, AtenTensorHandle weight1, AtenTensorHandle weight2, AtenTensorHandle weight3, AtenTensorHandle hx_, AtenTensorHandle cx_, int32_t reverse, const int64_t* batch_sizes, int64_t batch_sizes_len_, int64_t mode, int64_t hidden_size, int64_t num_layers, int32_t has_biases, int32_t bidirectional, int32_t batch_first, int32_t train);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_rnn_layer_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle out4, AtenTensorHandle out5, AtenTensorHandle out6, AtenTensorHandle input, AtenTensorHandle weight1, AtenTensorHandle weight2, AtenTensorHandle weight3, AtenTensorHandle weight4, AtenTensorHandle hx_, AtenTensorHandle cx_tmp, AtenTensorHandle output, AtenTensorHandle hy_, AtenTensorHandle cy_, AtenTensorHandle* grad_output, AtenTensorHandle* grad_hy, AtenTensorHandle* grad_cy, int32_t reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, int32_t has_biases, int32_t train, int32_t bidirectional, const int64_t* batch_sizes, int64_t batch_sizes_len_, int32_t batch_first, AtenTensorHandle workspace);
// miopen_* ops (batch norm, convolutions, rnn), out variants.
// NOTE(review): these backend-specific names are declared under the aoti_torch_cpu_
// prefix; confirm whether the generator emits them for every operator regardless of backend.
// The rnn declarations take `weight` as an array of handles with `weight_len_` entries,
// and miopen_rnn_backward also takes its fourth output, `out3`, as a handle array
// with `out3_len_` entries rather than a single handle.
// Parameters typed `AtenTensorHandle*` (bias, cx, dropout_state, running/save stats,
// grad_*) are presumably nullable optionals -- TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_miopen_batch_norm_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, int32_t training, double exponential_average_factor, double epsilon);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_miopen_batch_norm_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle input, AtenTensorHandle grad_output, AtenTensorHandle weight, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, AtenTensorHandle* save_mean, AtenTensorHandle* save_var, double epsilon);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_miopen_convolution_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, int32_t benchmark, int32_t deterministic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_miopen_convolution_transpose_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* output_padding, int64_t output_padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, int32_t benchmark, int32_t deterministic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_miopen_depthwise_convolution_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, int32_t benchmark, int32_t deterministic);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_miopen_rnn_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle out4, AtenTensorHandle input, const AtenTensorHandle* weight, int64_t weight_len_, int64_t weight_stride0, AtenTensorHandle hx, AtenTensorHandle* cx, int64_t mode, int64_t hidden_size, int64_t num_layers, int32_t batch_first, double dropout, int32_t train, int32_t bidirectional, const int64_t* batch_sizes, int64_t batch_sizes_len_, AtenTensorHandle* dropout_state);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_miopen_rnn_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, const AtenTensorHandle* out3, int64_t out3_len_, AtenTensorHandle input, const AtenTensorHandle* weight, int64_t weight_len_, int64_t weight_stride0, AtenTensorHandle weight_buf, AtenTensorHandle hx, AtenTensorHandle* cx, AtenTensorHandle output, AtenTensorHandle* grad_output, AtenTensorHandle* grad_hy, AtenTensorHandle* grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, int32_t batch_first, double dropout, int32_t train, int32_t bidirectional, const int64_t* batch_sizes, int64_t batch_sizes_len_, AtenTensorHandle* dropout_state, AtenTensorHandle reserve, const int32_t* output_mask, int64_t output_mask_len_);
// _sparse_sparse_matmul (two tensor operands) and mul by a scalar (passed as double),
// out variants.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_sparse_matmul_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mul_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other);
// Batch-norm family. _native_batch_norm_legit_functional has no `out` handles and
// returns five results through ret0..ret4; every other declaration is an out variant.
// In the two _native_batch_norm_legit_* declarations running_mean/running_var are
// plain handles, whereas elsewhere in this group they are `AtenTensorHandle*`
// (presumably nullable optionals -- TODO confirm), as are weight, bias and save_*.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_batch_norm_legit_functional(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, int32_t training, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3, AtenTensorHandle* ret4);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_batch_norm_legit_no_training_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, double momentum, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_stats_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle input, double eps);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_gather_stats_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle input, AtenTensorHandle mean, AtenTensorHandle invstd, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, double momentum, double eps, int64_t count);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_gather_stats_with_counts_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle input, AtenTensorHandle mean, AtenTensorHandle invstd, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, double momentum, double eps, AtenTensorHandle counts);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_batch_norm_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle grad_out, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, AtenTensorHandle* save_mean, AtenTensorHandle* save_invstd, int32_t train, double eps, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_backward_reduce_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle grad_out, AtenTensorHandle input, AtenTensorHandle mean, AtenTensorHandle invstd, AtenTensorHandle* weight, int32_t input_g, int32_t weight_g, int32_t bias_g);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_backward_elemt_out(AtenTensorHandle out, AtenTensorHandle grad_out, AtenTensorHandle input, AtenTensorHandle mean, AtenTensorHandle invstd, AtenTensorHandle* weight, AtenTensorHandle sum_dy, AtenTensorHandle sum_dy_xmu, AtenTensorHandle count);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_batch_norm_update_stats_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle input, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, double momentum);
// _nnpack_spatial_convolution, out variant. Takes only padding and stride lists
// (int64 array plus length each); `bias` is an `AtenTensorHandle*` (presumably nullable).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nnpack_spatial_convolution_out(AtenTensorHandle out, AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_);
// ones_like, out variant. `memory_format` is an `int32_t*` (presumably a nullable
// pointer to an enum code -- TODO confirm).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ones_like_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* memory_format);
// Distance ops (_euclidean_dist, _cdist forward/backward, _pdist forward/backward),
// out variants. The norm order `p` is passed as a double; `compute_mode` is an
// `int64_t*` (presumably nullable -- TODO confirm). The backward declarations also
// take the forward result handle (`cdist` / `pdist`).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__euclidean_dist_out(AtenTensorHandle out, AtenTensorHandle x1, AtenTensorHandle x2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cdist_forward_out(AtenTensorHandle out, AtenTensorHandle x1, AtenTensorHandle x2, double p, int64_t* compute_mode);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cdist_backward_out(AtenTensorHandle out, AtenTensorHandle grad, AtenTensorHandle x1, AtenTensorHandle x2, double p, AtenTensorHandle cdist);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__pdist_forward_out(AtenTensorHandle out, AtenTensorHandle self, double p);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__pdist_backward_out(AtenTensorHandle out, AtenTensorHandle grad, AtenTensorHandle self, double p, AtenTensorHandle pdist);
// pixel_shuffle, pixel_unshuffle and channel_shuffle, out variants. Each takes one
// input handle and a single int64 factor / group count.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pixel_shuffle_out(AtenTensorHandle out, AtenTensorHandle self, int64_t upscale_factor);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pixel_unshuffle_out(AtenTensorHandle out, AtenTensorHandle self, int64_t downscale_factor);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_channel_shuffle_out(AtenTensorHandle out, AtenTensorHandle self, int64_t groups);
// _pin_memory and scalar_tensor, out variants. _pin_memory takes the device as an
// `int32_t* device` (presumably nullable when unspecified -- TODO confirm) together
// with an int32 `device_index_`; scalar_tensor takes its scalar as a double.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__pin_memory_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* device, int32_t device_index_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scalar_tensor_out(AtenTensorHandle out, double s);
// Random *_like ops (rand_like, randint_like with and without `low`, randn_like),
// out variants. Bounds are int64; `memory_format` is an `int32_t*` (presumably a
// nullable pointer to an enum code -- TODO confirm).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand_like_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_like_out(AtenTensorHandle out, AtenTensorHandle self, int64_t high, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_like_low_dtype_out(AtenTensorHandle out, AtenTensorHandle self, int64_t low, int64_t high, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randn_like_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* memory_format);
// Out-variant shim declarations for repeat / repeat_interleave,
// _mkldnn_reshape, relu / celu, select_backward / slice_backward, the
// *_scatter ops and unsafe_split.
// Signature conventions visible here:
//   - integer lists are a `const int64_t*` followed by `<name>_len_`;
//   - the unsafe_split variants write to a list of outputs, passed as
//     `const AtenTensorHandle* out` plus `out_len_`, instead of a single
//     `out` handle.
// NOTE(review): pointer-typed scalars (`int64_t* output_size`, `int64_t* start`,
// `int64_t* end`, `int64_t* storage_offset`) presumably represent optional
// values where NULL means "not provided" -- TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_repeat_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* repeats, int64_t repeats_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_repeat_interleave_Tensor_out(AtenTensorHandle out, AtenTensorHandle repeats, int64_t* output_size);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__mkldnn_reshape_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* shape, int64_t shape_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_relu_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_select_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t dim, int64_t index);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_celu_out(AtenTensorHandle out, AtenTensorHandle self, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t dim, int64_t start, int64_t end, int64_t step);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_scatter_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle src, int64_t dim, int64_t* start, int64_t* end, int64_t step);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_select_scatter_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle src, int64_t dim, int64_t index);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diagonal_scatter_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle src, int64_t offset, int64_t dim1, int64_t dim2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_as_strided_scatter_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle src, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int64_t* storage_offset);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unsafe_split_Tensor_out(const AtenTensorHandle* out, int64_t out_len_, AtenTensorHandle self, int64_t split_size, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unsafe_split_with_sizes_out(const AtenTensorHandle* out, int64_t out_len_, AtenTensorHandle self, const int64_t* split_sizes, int64_t split_sizes_len_, int64_t dim);
// Out-variant shim declarations for sum / prod / std_mean (correction
// overload), _mkldnn_transpose, flip / roll / rot90 and
// _transform_bias_rescale_qkv. Ops with several results take one handle per
// result (`out0`, `out1`, ...) ahead of the inputs.
// NOTE(review): `int32_t* dtype` and `double* correction` are presumably
// nullable optionals, and the double pointer `const int64_t** dim` presumably
// encodes an optional integer list (NULL outer pointer = absent) -- confirm
// against the shim generator before relying on it.
// NOTE(review): `int32_t keepdim` looks like a boolean carried as int32_t --
// TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sum_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_std_mean_correction_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_prod_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__mkldnn_transpose_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim0, int64_t dim1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_flip_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dims, int64_t dims_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_roll_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* shifts, int64_t shifts_len_, const int64_t* dims, int64_t dims_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rot90_out(AtenTensorHandle out, AtenTensorHandle self, int64_t k, const int64_t* dims, int64_t dims_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__transform_bias_rescale_qkv_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle qkv, AtenTensorHandle qkv_bias, int64_t num_heads);
// Out-variant shim declarations for the `_nested_*` operator family
// (from_mask, from_padded, size / strides / storage_offsets accessors,
// view_from_buffer_copy, view_from_jagged_copy, get_values_copy). Every
// prototype takes the destination `out` handle first and returns
// AOTITorchError.
// NOTE(review): `int32_t mask_check` / `int32_t fuse_transform_0213` look like
// booleans carried as int32_t, and `AtenTensorHandle* lengths` is presumably
// an optional tensor (NULL = absent) -- TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_from_mask_out(AtenTensorHandle out, AtenTensorHandle t, AtenTensorHandle mask, int32_t mask_check);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_from_padded_out(AtenTensorHandle out, AtenTensorHandle padded, AtenTensorHandle cpu_nested_shape_example, int32_t fuse_transform_0213);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_size_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_strides_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_storage_offsets_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_from_padded_and_nested_example_out(AtenTensorHandle out, AtenTensorHandle padded, AtenTensorHandle nt_example);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_view_from_buffer_copy_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle nested_size, AtenTensorHandle nested_strides, AtenTensorHandle offsets);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_view_from_jagged_copy_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle offsets, AtenTensorHandle dummy, AtenTensorHandle* lengths, int64_t ragged_idx);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_get_values_copy_out(AtenTensorHandle out, AtenTensorHandle self);
// Out-variant shim declarations for _trilinear and the unique family
// (_unique, unique_dim, unique_consecutive, unique_dim_consecutive,
// _unique2). The unique variants declare two or three output handles
// (`out0`..`out2`) before `self`; _trilinear takes four integer lists, each as
// a `const int64_t*` / `<name>_len_` pair.
// NOTE(review): the int32_t flags (`sorted`, `return_inverse`,
// `return_counts`) are presumably booleans, and `int64_t* dim` on
// unique_consecutive is presumably an optional dimension (NULL = absent) --
// confirm against the shim generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__trilinear_out(AtenTensorHandle out, AtenTensorHandle i1, AtenTensorHandle i2, AtenTensorHandle i3, const int64_t* expand1, int64_t expand1_len_, const int64_t* expand2, int64_t expand2_len_, const int64_t* expand3, int64_t expand3_len_, const int64_t* sumdim, int64_t sumdim_len_, int64_t unroll_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unique_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, int32_t sorted, int32_t return_inverse);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unique_dim_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle self, int64_t dim, int32_t sorted, int32_t return_inverse, int32_t return_counts);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unique_consecutive_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle self, int32_t return_inverse, int32_t return_counts, int64_t* dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unique_dim_consecutive_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle self, int64_t dim, int32_t return_inverse, int32_t return_counts);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unique2_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle self, int32_t sorted, int32_t return_inverse, int32_t return_counts);
// Shim declarations for _unsafe_view, var_mean (correction overload),
// _weight_norm_interface (forward and backward), _efficientzerotensor,
// zeros_like, _standard_gamma_grad, _dirichlet_grad, native_norm and two
// batch-norm entry points.
// All but one are out-variants with the destination handle(s) first. The
// exception is `_batch_norm_with_update_functional`, which has no `out`
// parameter and instead hands back six results through the trailing
// `AtenTensorHandle* ret0` .. `ret5` pointers.
// NOTE(review): pointer-typed inputs (`AtenTensorHandle* weight` / `bias` /
// `running_mean` / `running_var`, `double* p`, `double* correction`,
// `int32_t* dtype`, `int32_t* memory_format`, `const int64_t** dim`) are
// presumably optionals with NULL meaning "absent" -- TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__unsafe_view_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_var_mean_correction_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, const int64_t** dim, int64_t dim_len_, double* correction, int32_t keepdim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__weight_norm_interface_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle v, AtenTensorHandle g, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__weight_norm_interface_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle grad_w, AtenTensorHandle saved_v, AtenTensorHandle saved_g, AtenTensorHandle saved_norms, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__efficientzerotensor_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_zeros_like_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__standard_gamma_grad_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle output);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__dirichlet_grad_out(AtenTensorHandle out, AtenTensorHandle x, AtenTensorHandle alpha, AtenTensorHandle total);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_norm_out(AtenTensorHandle out, AtenTensorHandle self, double p);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_norm_ScalarOpt_dim_dtype_out(AtenTensorHandle out, AtenTensorHandle self, double* p, const int64_t* dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__batch_norm_with_update_functional(AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle running_mean, AtenTensorHandle running_var, double momentum, double eps, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3, AtenTensorHandle* ret4, AtenTensorHandle* ret5);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__batch_norm_no_update_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle input, AtenTensorHandle* weight, AtenTensorHandle* bias, AtenTensorHandle* running_mean, AtenTensorHandle* running_var, double momentum, double eps);
// Out-variant shim declarations for sparse reductions (_sparse_sum,
// _sparse_csr_sum, _sparse_csr_prod), sparse softmax / log_softmax (forward
// and backward_data), _spdiags and the norm overloads. `out` is always the
// first parameter; reduction dimensions are a `const int64_t* dim` /
// `dim_len_` pair.
// Note that `norm_ScalarOpt_dtype_out` takes `dtype` by value (int32_t) while
// the `_sparse_csr_*_dim_dtype_out` declarations take it as `int32_t*`.
// NOTE(review): the pointer-typed `dtype`, `layout` and `double* p` are
// presumably nullable optionals, and `keepdim` / `half_to_float` presumably
// booleans carried as int32_t -- confirm against the shim generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_sum_dim_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_sum_backward_out(AtenTensorHandle out, AtenTensorHandle grad, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_csr_sum_dim_dtype_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_csr_prod_dim_dtype_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int32_t keepdim, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_softmax_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t half_to_float);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_softmax_backward_data_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle output, int64_t dim, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_log_softmax_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int32_t half_to_float);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_log_softmax_backward_data_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle output, int64_t dim, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__spdiags_out(AtenTensorHandle out, AtenTensorHandle diagonals, AtenTensorHandle offsets, const int64_t* shape, int64_t shape_len_, int32_t* layout);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_norm_ScalarOpt_dtype_out(AtenTensorHandle out, AtenTensorHandle self, double* p, int32_t dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_norm_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double p);
// Shim declarations for clone, resize_as, resize_as_sparse, zero and the
// sub / rsub scalar and tensor overloads.
// resize_as, resize_as_sparse and zero are each declared twice: an `_out`
// form that takes the destination handle `out` first, and a form without the
// suffix that instead hands back a handle through a trailing
// `AtenTensorHandle* ret0`.
// Scalar operands (`other`, `alpha`) are declared as double.
// NOTE(review): `int32_t* memory_format` is presumably a nullable optional --
// TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_clone_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as_sparse_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle the_template);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as_sparse(AtenTensorHandle self, AtenTensorHandle the_template, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_zero_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_zero(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sub_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rsub_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rsub_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other, double alpha);
// Shim declarations for sparse tensor ops: _sparse_addmm, sparse COO
// construction, sparse_resize / sparse_resize_and_clear, sparse_mask,
// _to_dense, _coalesce / _coalesced, copy_sparse_to_sparse and the
// _to_sparse* layout conversions.
// sparse_resize, sparse_resize_and_clear, _coalesced and
// copy_sparse_to_sparse are each declared in two forms: `_out` (destination
// handle `out` first) and a form without the suffix that hands back a handle
// through a trailing `AtenTensorHandle* ret0`.
// NOTE(review): pointer-typed scalars (`int32_t* is_coalesced`,
// `int32_t* dtype`, `int32_t* masked_grad`, `int32_t* layout`,
// `int64_t* dense_dim`) and the double pointer `const int64_t** blocksize`
// are presumably optionals with NULL meaning "absent"; int32_t flags such as
// `coalesced`, `non_blocking` and `accumulate_matches` are presumably
// booleans -- confirm against the shim generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_addmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat1, AtenTensorHandle mat2, double beta, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_coo_tensor_size_out(AtenTensorHandle out, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_coo_tensor_with_dims_out(AtenTensorHandle out, int64_t sparse_dim, int64_t dense_dim, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_coo_tensor_with_dims_and_tensors_out(AtenTensorHandle out, int64_t sparse_dim, int64_t dense_dim, const int64_t* size, int64_t size_len_, AtenTensorHandle indices, AtenTensorHandle values, int32_t* is_coalesced);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_resize_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, int64_t sparse_dim, int64_t dense_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_resize(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int64_t sparse_dim, int64_t dense_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_resize_and_clear_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, int64_t sparse_dim, int64_t dense_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_resize_and_clear(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int64_t sparse_dim, int64_t dense_dim, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sparse_mask_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_mask_projection_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask, int32_t accumulate_matches);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_dense_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* dtype, int32_t* masked_grad);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__coalesce_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__coalesced_out(AtenTensorHandle out, AtenTensorHandle self, int32_t coalesced);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__coalesced(AtenTensorHandle self, int32_t coalesced, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_copy_sparse_to_sparse_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle src, int32_t non_blocking);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_copy_sparse_to_sparse(AtenTensorHandle self, AtenTensorHandle src, int32_t non_blocking, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_sparse_dim_out(AtenTensorHandle out, AtenTensorHandle self, int64_t sparse_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* layout, const int64_t** blocksize, int64_t blocksize_len_, int64_t* dense_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_csr_out(AtenTensorHandle out, AtenTensorHandle self, int64_t* dense_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_csc_out(AtenTensorHandle out, AtenTensorHandle self, int64_t* dense_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_bsr_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* blocksize, int64_t blocksize_len_, int64_t* dense_dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_sparse_bsc_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* blocksize, int64_t blocksize_len_, int64_t* dense_dim);
// Out-variant shim declarations for to_mkldnn and the
// mkldnn_reorder_conv2d_weight / mkldnn_reorder_conv3d_weight ops. The
// reorder declarations take padding, stride and dilation as
// `const int64_t*` / `<name>_len_` pairs followed by `groups`; only the conv2d
// form has the extra `input_size` argument.
// NOTE(review): `int32_t* dtype` and the double pointer
// `const int64_t** input_size` are presumably optionals (NULL = absent) --
// TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_to_mkldnn_out(AtenTensorHandle out, AtenTensorHandle self, int32_t* dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_reorder_conv2d_weight_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups, const int64_t** input_size, int64_t input_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_reorder_conv3d_weight_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, const int64_t* stride, int64_t stride_len_, const int64_t* dilation, int64_t dilation_len_, int64_t groups);
// Shim declarations for quantization ops: quantize_per_tensor /
// quantize_per_channel, dequantize, q_per_channel_* accessors, int_repr,
// _make_per_*_quantized_tensor, the fake_quantize family and
// _fused_moving_avg_obs_fq_helper.
// Signature conventions visible here:
//   - the quantize declarations take `dtype` by value as int32_t;
//   - the list overloads (`quantize_per_tensor_tensors_out`,
//     `dequantize_tensors_out`) take both outputs and inputs as
//     `const AtenTensorHandle*` arrays with a `<name>_len_` count;
//   - `_fused_moving_avg_obs_fq_helper_functional` has no `out` parameters and
//     hands back six results through trailing `AtenTensorHandle* ret0`..`ret5`.
// NOTE(review): int32_t flags (`reduce_range`, `per_row_fake_quant`,
// `symmetric_quant`) are presumably booleans -- confirm against the shim
// generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_tensor_dynamic_out(AtenTensorHandle out, AtenTensorHandle self, int32_t dtype, int32_t reduce_range);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_tensor_out(AtenTensorHandle out, AtenTensorHandle self, double scale, int64_t zero_point, int32_t dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_tensor_tensor_qparams_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int32_t dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_tensor_tensors_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* tensors, int64_t tensors_len_, AtenTensorHandle scales, AtenTensorHandle zero_points, int32_t dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_quantize_per_channel_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle scales, AtenTensorHandle zero_points, int64_t axis, int32_t dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dequantize_self_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dequantize_tensors_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* tensors, int64_t tensors_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_q_per_channel_scales_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_q_per_channel_zero_points_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_int_repr_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_per_tensor_quantized_tensor_out(AtenTensorHandle out, AtenTensorHandle self, double scale, int64_t zero_point);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_per_channel_quantized_tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t axis);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fake_quantize_per_tensor_affine_cachemask_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, AtenTensorHandle fake_quant_enabled, int64_t quant_min, int64_t quant_max);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_learnable_per_tensor_affine_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t quant_min, int64_t quant_max, double grad_factor);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fake_quantize_per_channel_affine_cachemask_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t axis, int64_t quant_min, int64_t quant_max);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fake_quantize_learnable_per_channel_affine_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle scale, AtenTensorHandle zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_moving_avg_obs_fq_helper_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle self, AtenTensorHandle observer_on, AtenTensorHandle fake_quant_on, AtenTensorHandle running_min, AtenTensorHandle running_max, AtenTensorHandle scale, AtenTensorHandle zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int32_t per_row_fake_quant, int32_t symmetric_quant);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_moving_avg_obs_fq_helper_functional(AtenTensorHandle self, AtenTensorHandle observer_on, AtenTensorHandle fake_quant_on, AtenTensorHandle running_min, AtenTensorHandle running_max, AtenTensorHandle scale, AtenTensorHandle zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int32_t per_row_fake_quant, int32_t symmetric_quant, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3, AtenTensorHandle* ret4, AtenTensorHandle* ret5);
// Out-variant shim declarations for _to_copy, the `_lstm_mps` /
// `lstm_mps_backward` ops, the `_thnn_fused_lstm_cell` / `_thnn_fused_gru_cell`
// ops (forward and backward) and _pack_padded_sequence.
// Multi-result ops list their output handles (`out0`, `out1`, ...) first.
// `lstm_mps_backward_out` mixes a single handle (`out0`) with two output
// lists (`out1`, `out2`), each passed as `const AtenTensorHandle*` plus a
// `<name>_len_` count; tensor-list inputs (`hx`, `params`) use the same
// pointer-plus-length form.
// NOTE(review): pointer-typed tensors (`grad_y`, `grad_hy`, `grad_cy`,
// `input_bias`, `hidden_bias`) and `int32_t* memory_format` are presumably
// optionals with NULL meaning "absent"; int32_t flags (`non_blocking`,
// `has_biases`, `train`, `bidirectional`, `batch_first`, `has_bias`) are
// presumably booleans -- TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__to_copy_out(AtenTensorHandle out, AtenTensorHandle self, int32_t non_blocking, int32_t* memory_format);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__lstm_mps_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle out4, AtenTensorHandle out5, AtenTensorHandle input, const AtenTensorHandle* hx, int64_t hx_len_, const AtenTensorHandle* params, int64_t params_len_, int32_t has_biases, int64_t num_layers, double dropout, int32_t train, int32_t bidirectional, int32_t batch_first);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_lstm_mps_backward_out(AtenTensorHandle out0, const AtenTensorHandle* out1, int64_t out1_len_, const AtenTensorHandle* out2, int64_t out2_len_, AtenTensorHandle* grad_y, AtenTensorHandle* grad_hy, AtenTensorHandle* grad_cy, AtenTensorHandle z_state, AtenTensorHandle cell_state_fwd, AtenTensorHandle input, AtenTensorHandle layersOutputs, const AtenTensorHandle* hx, int64_t hx_len_, const AtenTensorHandle* params, int64_t params_len_, int32_t has_biases, int64_t num_layers, double dropout, int32_t train, int32_t bidirectional, int32_t batch_first);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__thnn_fused_lstm_cell_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle input_gates, AtenTensorHandle hidden_gates, AtenTensorHandle cx, AtenTensorHandle* input_bias, AtenTensorHandle* hidden_bias);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__thnn_fused_lstm_cell_backward_impl_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle* grad_hy, AtenTensorHandle* grad_cy, AtenTensorHandle cx, AtenTensorHandle cy, AtenTensorHandle workspace, int32_t has_bias);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__thnn_fused_gru_cell_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle input_gates, AtenTensorHandle hidden_gates, AtenTensorHandle hx, AtenTensorHandle* input_bias, AtenTensorHandle* hidden_bias);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__thnn_fused_gru_cell_backward_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle out3, AtenTensorHandle out4, AtenTensorHandle grad_hy, AtenTensorHandle workspace, int32_t has_bias);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__pack_padded_sequence_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle input, AtenTensorHandle lengths, int32_t batch_first);
// Shim declarations for set, lift / lift_fresh_copy, masked_fill /
// masked_scatter, _masked_softmax (forward and backward), put and index_fill.
// `set_source_Tensor` and `set` are each declared twice: an `_out` form with
// the destination handle `out` first, and a form without the suffix that
// hands back a handle through a trailing `AtenTensorHandle* ret0`.
// The `Scalar` overloads take their value as a double; the `Tensor`
// overloads take an AtenTensorHandle in the same position.
// NOTE(review): `int64_t* dim` / `int64_t* mask_type` are presumably nullable
// optionals and `int32_t accumulate` presumably a boolean -- confirm against
// the shim generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_set_source_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle source);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_set_source_Tensor(AtenTensorHandle self, AtenTensorHandle source, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_set_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_set(AtenTensorHandle self, AtenTensorHandle* ret0);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_lift_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_lift_fresh_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_fill_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_fill_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_masked_scatter_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle source);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__masked_softmax_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mask, int64_t* dim, int64_t* mask_type);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__masked_softmax_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle output, AtenTensorHandle mask, int64_t* dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_put_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle index, AtenTensorHandle source, int32_t accumulate);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_fill_int_Scalar_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_fill_int_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle value);
// Out-variant shim declarations for the bitwise and / or / xor
// `Scalar_Tensor` overloads, `__lshift__` / `__rshift__` and the
// bitwise_left_shift / bitwise_right_shift `Scalar_Tensor` overloads.
// In the `Scalar_Tensor` forms the left operand `self` is the scalar and
// `other` is the tensor handle; in the `__lshift__` / `__rshift__` `Scalar`
// forms `self` is the tensor and `other` the scalar.
// NOTE(review): every scalar operand here is declared as `double`, even
// though these are bitwise ops; how integer scalars are represented across
// this boundary is not visible from the declarations -- verify with the shim
// generator.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bitwise_and_Scalar_Tensor_out(AtenTensorHandle out, double self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bitwise_or_Scalar_Tensor_out(AtenTensorHandle out, double self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bitwise_xor_Scalar_Tensor_out(AtenTensorHandle out, double self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu___lshift___Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu___lshift___Tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bitwise_left_shift_Scalar_Tensor_out(AtenTensorHandle out, double self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu___rshift___Scalar_out(AtenTensorHandle out, AtenTensorHandle self, double other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu___rshift___Tensor_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bitwise_right_shift_Scalar_Tensor_out(AtenTensorHandle out, double self, AtenTensorHandle other);
// Shim declarations for tril_indices / triu_indices, trace,
// _cholesky_solve_helper, dist, the _histogramdd_* ops, remainder
// (`Scalar_Tensor`), argsort (`stable`), unfold_backward and the _amp_* ops.
// Signature conventions visible here:
//   - `_histogramdd_bin_edges_out` and
//     `_amp_foreach_non_finite_check_and_unscale_out` write to a list of
//     outputs (`const AtenTensorHandle* out` plus `out_len_`);
//   - `_amp_update_scale` is declared both as `_out` (destination first) and
//     without the suffix, where two results come back through
//     `AtenTensorHandle* ret0` / `ret1`.
// NOTE(review): `const double** range` is presumably an optional list of
// doubles and `AtenTensorHandle* weight` an optional tensor (NULL = absent);
// int32_t flags (`upper`, `density`, `stable`, `descending`) are presumably
// booleans -- TODO confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_tril_indices_out(AtenTensorHandle out, int64_t row, int64_t col, int64_t offset);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_triu_indices_out(AtenTensorHandle out, int64_t row, int64_t col, int64_t offset);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_trace_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__cholesky_solve_helper_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle A, int32_t upper);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_dist_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle other, double p);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__histogramdd_bin_edges_out(const AtenTensorHandle* out, int64_t out_len_, AtenTensorHandle self, const int64_t* bins, int64_t bins_len_, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__histogramdd_from_bin_cts_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* bins, int64_t bins_len_, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__histogramdd_from_bin_tensors_out(AtenTensorHandle out, AtenTensorHandle self, const AtenTensorHandle* bins, int64_t bins_len_, AtenTensorHandle* weight, int32_t density);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_remainder_Scalar_Tensor_out(AtenTensorHandle out, double self, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_argsort_stable_out(AtenTensorHandle out, AtenTensorHandle self, int32_t stable, int64_t dim, int32_t descending);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unfold_backward_out(AtenTensorHandle out, AtenTensorHandle grad_in, const int64_t* input_sizes, int64_t input_sizes_len_, int64_t dim, int64_t size, int64_t step);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__amp_foreach_non_finite_check_and_unscale_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle found_inf, AtenTensorHandle inv_scale);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__amp_update_scale_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle growth_tracker, AtenTensorHandle found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__amp_update_scale(AtenTensorHandle self, AtenTensorHandle growth_tracker, AtenTensorHandle found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, AtenTensorHandle* ret0, AtenTensorHandle* ret1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_add_Tensor_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle other, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sub_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sub_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_, double alpha);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sub_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
// Out-variant shims for the _foreach_mul / _foreach_div overloads.
// Same leading (out, out_len_, self, self_len_) layout as the other foreach
// shims; the second operand is a double (Scalar), a handle array with length
// (List), a double array with length (ScalarList), or one tensor handle (Tensor).
// Unlike the add/sub List and Tensor overloads, no `alpha` argument is taken.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_mul_Tensor_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle other);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_div_Tensor_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, AtenTensorHandle other);
// Out-variant shims for _foreach_clamp_max / _foreach_clamp_min.
// Three overloads each: the bound is one double (Scalar), a handle array with
// length (List), or a double array with length (ScalarList).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_max_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_max_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_max_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_min_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_min_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_clamp_min_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
// Out-variant shims for _foreach_maximum / _foreach_minimum.
// Signatures are parallel to the clamp_max / clamp_min shims: Scalar takes a
// double, List takes a handle array with length, ScalarList takes a double
// array with length.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_maximum_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_maximum_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_maximum_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_minimum_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double scalar);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_minimum_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* other, int64_t other_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_minimum_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* scalars, int64_t scalars_len_);
// Out-variant shims for _foreach_addcdiv / _foreach_addcmul.
// Four handle arrays are passed as (pointer, length) pairs: out, self, tensor1
// and tensor2. The final argument carries the scaling factor(s): a single
// double `value` (Scalar), a double array with length (ScalarList), or a single
// tensor handle named `scalars` (Tensor).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcdiv_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcdiv_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcdiv_Tensor_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, AtenTensorHandle scalars);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcmul_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, double value);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcmul_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, const double* scalars, int64_t scalars_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_addcmul_Tensor_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensor1, int64_t tensor1_len_, const AtenTensorHandle* tensor2, int64_t tensor2_len_, AtenTensorHandle scalars);
// Out-variant shims for unary _foreach ops (abs through frac).
// All share one signature: an `out` handle array with its length and a `self`
// handle array with its length; no further operands.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_abs_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_acos_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_asin_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_atan_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_ceil_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_cos_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_cosh_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_erf_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_erfc_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_exp_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_expm1_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_floor_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_frac_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
// Out-variant shims for _foreach_lerp.
// Both take out, self and tensors1 as handle arrays with lengths. The weight is
// either a fourth handle array with length (List) or a single double (Scalar).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_lerp_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensors1, int64_t tensors1_len_, const AtenTensorHandle* weights, int64_t weights_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_lerp_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* tensors1, int64_t tensors1_len_, double weight);
// Out-variant shims for unary _foreach ops (lgamma, the log family, neg).
// Each takes only the `out` and `self` handle arrays with their lengths.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_lgamma_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log10_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log1p_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_log2_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_neg_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
// Out-variant shim for _foreach_norm (Scalar overload): out/self handle arrays
// with lengths, plus the norm order `ord` passed as a double.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_norm_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double ord);
// Out-variant shims for _foreach_pow.
// `exponent` changes type with the overload: a handle array with length (List),
// a single double (Scalar), or a double array with length (ScalarList).
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_pow_List_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* exponent, int64_t exponent_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_pow_Scalar_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, double exponent);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_pow_ScalarList_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const double* exponent, int64_t exponent_len_);
// Out-variant shims for unary _foreach ops (reciprocal through zero).
// Each takes only the `out` and `self` handle arrays with their lengths.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_reciprocal_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_round_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sigmoid_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sign_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sin_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sinh_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_sqrt_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_tan_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_tanh_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_trunc_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_zero_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_);
// Out-variant shim for _foreach_copy: out, self and src handle arrays with
// lengths, plus `non_blocking` as an int32_t.
// NOTE(review): int32_t is presumably used as a C-ABI boolean here - confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foreach_copy_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* src, int64_t src_len_, int32_t non_blocking);
// Single-output out-variant shims: bucketize (Scalar overload), GLU
// forward/backward JVP, and backward passes for hardswish, rrelu_with_noise and
// the MKLDNN adaptive average pool.
// Each takes the destination handle `out` first, followed by the operator's
// inputs as plain handles, doubles and integers.
// NOTE(review): the int32_t parameters (out_int32, right, training,
// self_is_result) are presumably boolean flags - confirm.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bucketize_Scalar_out(AtenTensorHandle out, double self, AtenTensorHandle boundaries, int32_t out_int32, int32_t right);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_glu_jvp_out(AtenTensorHandle out, AtenTensorHandle glu, AtenTensorHandle x, AtenTensorHandle dx, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_glu_backward_jvp_out(AtenTensorHandle out, AtenTensorHandle grad_x, AtenTensorHandle grad_glu, AtenTensorHandle x, AtenTensorHandle dgrad_glu, AtenTensorHandle dx, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_hardswish_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rrelu_with_noise_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle noise, double lower, double upper, int32_t training, int32_t self_is_result);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mkldnn_adaptive_avg_pool2d_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle self);
// Out-variant shims for _adaptive_avg_pool2d / _adaptive_avg_pool3d.
// Forward variants take `output_size` as an int64_t array with its length;
// backward variants take only `grad_output` and `self`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool2d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool2d_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool3d_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__adaptive_avg_pool3d_backward_out(AtenTensorHandle out, AtenTensorHandle grad_output, AtenTensorHandle self);
// Out-variant shims for convolution kernels.
// kernel_size, stride, padding and dilation are int64_t arrays, each followed
// by its length. _slow_conv2d_backward writes three outputs (out0..out2) and
// takes `output_mask` as an int32_t array with length.
// `bias` is passed as a pointer to a handle rather than a handle.
// NOTE(review): presumably a null `bias` pointer means "no bias" - confirm
// against the optional-argument convention in shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__slow_conv2d_backward_output_mask_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle out2, AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int32_t* output_mask, int64_t output_mask_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_conv_depthwise3d_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv_dilated2d_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slow_conv_dilated3d_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle weight, const int64_t* kernel_size, int64_t kernel_size_len_, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_);
// Out-variant shims for isinf and linalg_matrix_exp: one destination handle
// and one input handle each.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_isinf_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_linalg_matrix_exp_out(AtenTensorHandle out, AtenTensorHandle self);
// Out-variant shims for the `_test_*` operators.
// The three `_test_optional_*list` shims take `addends` as a pointer to a
// pointer to the element array, followed by the length.
// NOTE(review): the extra level of indirection presumably lets a null outer
// pointer represent an absent optional list - confirm against shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_optional_intlist_out(AtenTensorHandle out, AtenTensorHandle values, const int64_t** addends, int64_t addends_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_optional_filled_intlist_out(AtenTensorHandle out, AtenTensorHandle values, const int64_t** addends, int64_t addends_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_optional_floatlist_out(AtenTensorHandle out, AtenTensorHandle values, const double** addends, int64_t addends_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_warn_in_autograd_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_autograd_multiple_dispatch_fullcoverage_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__test_autograd_multiple_dispatch_view_copy_out(AtenTensorHandle out, AtenTensorHandle self);
// Out-variant shims for segment_reduce and its backward.
// `reduce` is a C string naming the reduction. lengths / indices / offsets are
// pointers to handles and `initial` is a pointer to double.
// NOTE(review): the pointer-typed arguments are presumably nullable to express
// optional values - confirm against shim.h. `unsafe` is presumably a boolean.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_segment_reduce_out(AtenTensorHandle out, AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__segment_reduce_backward_out(AtenTensorHandle out, AtenTensorHandle grad, AtenTensorHandle output, AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* offsets, int64_t axis, double* initial);
// Out-variant shim for _nested_tensor_from_tensor_list: `list` is a handle
// array with length. dtype / layout / device / pin_memory are int32_t pointers
// and `device_index_` is a plain int32_t.
// NOTE(review): the pointer-typed tensor options are presumably nullable
// optionals encoded as integers - confirm against shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__nested_tensor_from_tensor_list_out(AtenTensorHandle out, const AtenTensorHandle* list, int64_t list_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory);
// Out-variant shims for `*_copy` ops: forward-AD primal/dual, real/complex
// views, conj and neg_view.
// Each writes to `out`; _fw_primal_copy and _make_dual_copy also take the
// int64_t `level`, and _make_dual_copy takes both `primal` and `tangent`.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fw_primal_copy_out(AtenTensorHandle out, AtenTensorHandle self, int64_t level);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__make_dual_copy_out(AtenTensorHandle out, AtenTensorHandle primal, AtenTensorHandle tangent, int64_t level);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_as_real_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_as_complex_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__conj_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__neg_view_copy_out(AtenTensorHandle out, AtenTensorHandle self);
// Out-variant shims for shape/stride-manipulating `*_copy` ops (as_strided,
// expand, permute, select, slice, squeeze, transpose, unsqueeze, ...).
// Size, stride and dimension lists are int64_t arrays followed by a length.
// `storage_offset` (as_strided_copy) and `start` / `end` (slice_copy) are
// int64_t pointers rather than values.
// NOTE(review): those pointer arguments are presumably nullable optionals, and
// `implicit` (expand_copy) is presumably a boolean - confirm against shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_as_strided_copy_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_, int64_t* storage_offset);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__sparse_broadcast_to_copy_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_diagonal_copy_out(AtenTensorHandle out, AtenTensorHandle self, int64_t offset, int64_t dim1, int64_t dim2);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_expand_copy_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t implicit);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_permute_copy_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dims, int64_t dims_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__reshape_alias_copy_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_, const int64_t* stride, int64_t stride_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_select_copy_int_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int64_t index);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_detach_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_copy_Tensor_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_copy_dim_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_squeeze_copy_dims_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* dim, int64_t dim_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_t_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_transpose_copy_int_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim0, int64_t dim1);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unsqueeze_copy_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim);
// Out-variant shims for the indices/values `*_copy` accessors, including the
// crow/col/ccol/row index variants. All share the (out, self) signature.
// Note that both underscore-prefixed (`_indices`, `_values`) and unprefixed
// (`indices`, `values`) entry points are exported as distinct symbols.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__indices_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__values_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_indices_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_values_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_crow_indices_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_col_indices_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ccol_indices_copy_out(AtenTensorHandle out, AtenTensorHandle self);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_row_indices_copy_out(AtenTensorHandle out, AtenTensorHandle self);
// Out-variant shims for view_copy (by size and by dtype), unfold_copy and
// alias_copy. view_copy takes `size` as an int64_t array with length;
// view_copy_dtype takes the target dtype as an int32_t code.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_copy_out(AtenTensorHandle out, AtenTensorHandle self, const int64_t* size, int64_t size_len_);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_view_copy_dtype_out(AtenTensorHandle out, AtenTensorHandle self, int32_t dtype);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_unfold_copy_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dimension, int64_t size, int64_t step);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_alias_copy_out(AtenTensorHandle out, AtenTensorHandle self);
// Out-variant shim for to_padded_tensor: `padding` is a double fill value and
// `output_size` is a pointer to a pointer to an int64_t array, with its length.
// NOTE(review): the double indirection presumably encodes an optional list
// (null outer pointer = not provided) - confirm against shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_to_padded_tensor_out(AtenTensorHandle out, AtenTensorHandle self, double padding, const int64_t** output_size, int64_t output_size_len_);
// Out-variant shims for transformer / attention kernels.
// Weights and biases are passed as individual tensor handles.
// _native_multi_head_attention writes two outputs (out0, out1); the others
// write one.
// `mask` is a pointer to a handle and `mask_type` is a pointer to int64_t.
// NOTE(review): those pointers are presumably nullable optionals, and the
// int32_t parameters (use_gelu, norm_first, need_weights,
// average_attn_weights) are presumably booleans - confirm against shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__transformer_encoder_layer_fwd_out(AtenTensorHandle out, AtenTensorHandle src, int64_t embed_dim, int64_t num_heads, AtenTensorHandle qkv_weight, AtenTensorHandle qkv_bias, AtenTensorHandle proj_weight, AtenTensorHandle proj_bias, int32_t use_gelu, int32_t norm_first, double eps, AtenTensorHandle norm_weight_1, AtenTensorHandle norm_bias_1, AtenTensorHandle norm_weight_2, AtenTensorHandle norm_bias_2, AtenTensorHandle ffn_weight_1, AtenTensorHandle ffn_bias_1, AtenTensorHandle ffn_weight_2, AtenTensorHandle ffn_bias_2, AtenTensorHandle* mask, int64_t* mask_type);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__native_multi_head_attention_out(AtenTensorHandle out0, AtenTensorHandle out1, AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, int64_t embed_dim, int64_t num_head, AtenTensorHandle qkv_weight, AtenTensorHandle qkv_bias, AtenTensorHandle proj_weight, AtenTensorHandle proj_bias, AtenTensorHandle* mask, int32_t need_weights, int32_t average_attn_weights, int64_t* mask_type);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__triton_scaled_dot_attention_out(AtenTensorHandle out, AtenTensorHandle q, AtenTensorHandle k, AtenTensorHandle v, double dropout_p);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__triton_multi_head_attention_out(AtenTensorHandle out, AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, int64_t embed_dim, int64_t num_head, AtenTensorHandle qkv_weight, AtenTensorHandle qkv_bias, AtenTensorHandle proj_weight, AtenTensorHandle proj_bias, AtenTensorHandle* mask);
// Out-variant shim for _foobar: one input handle plus three int32_t arguments.
// NOTE(review): arg1..arg3 are presumably boolean flags - confirm against the
// operator schema.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__foobar_out(AtenTensorHandle out, AtenTensorHandle self, int32_t arg1, int32_t arg2, int32_t arg3);
// Out-variant shims for the fused Adam / AdamW optimizer steps.
// Seven handle arrays are passed as (pointer, length) pairs: out, self, grads,
// exp_avgs, exp_avg_sqs, max_exp_avg_sqs and state_steps. Hyperparameters
// follow as doubles. The `_tensor_lr` variants differ only in taking `lr` as a
// tensor handle instead of a double.
// `grad_scale` and `found_inf` are pointers to handles.
// NOTE(review): those two are presumably nullable optionals, and `amsgrad` /
// `maximize` are presumably booleans - confirm against shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_adam_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* grads, int64_t grads_len_, const AtenTensorHandle* exp_avgs, int64_t exp_avgs_len_, const AtenTensorHandle* exp_avg_sqs, int64_t exp_avg_sqs_len_, const AtenTensorHandle* max_exp_avg_sqs, int64_t max_exp_avg_sqs_len_, const AtenTensorHandle* state_steps, int64_t state_steps_len_, double lr, double beta1, double beta2, double weight_decay, double eps, int32_t amsgrad, int32_t maximize, AtenTensorHandle* grad_scale, AtenTensorHandle* found_inf);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_adam_tensor_lr_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* grads, int64_t grads_len_, const AtenTensorHandle* exp_avgs, int64_t exp_avgs_len_, const AtenTensorHandle* exp_avg_sqs, int64_t exp_avg_sqs_len_, const AtenTensorHandle* max_exp_avg_sqs, int64_t max_exp_avg_sqs_len_, const AtenTensorHandle* state_steps, int64_t state_steps_len_, AtenTensorHandle lr, double beta1, double beta2, double weight_decay, double eps, int32_t amsgrad, int32_t maximize, AtenTensorHandle* grad_scale, AtenTensorHandle* found_inf);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_adamw_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* grads, int64_t grads_len_, const AtenTensorHandle* exp_avgs, int64_t exp_avgs_len_, const AtenTensorHandle* exp_avg_sqs, int64_t exp_avg_sqs_len_, const AtenTensorHandle* max_exp_avg_sqs, int64_t max_exp_avg_sqs_len_, const AtenTensorHandle* state_steps, int64_t state_steps_len_, double lr, double beta1, double beta2, double weight_decay, double eps, int32_t amsgrad, int32_t maximize, AtenTensorHandle* grad_scale, AtenTensorHandle* found_inf);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_adamw_tensor_lr_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* grads, int64_t grads_len_, const AtenTensorHandle* exp_avgs, int64_t exp_avgs_len_, const AtenTensorHandle* exp_avg_sqs, int64_t exp_avg_sqs_len_, const AtenTensorHandle* max_exp_avg_sqs, int64_t max_exp_avg_sqs_len_, const AtenTensorHandle* state_steps, int64_t state_steps_len_, AtenTensorHandle lr, double beta1, double beta2, double weight_decay, double eps, int32_t amsgrad, int32_t maximize, AtenTensorHandle* grad_scale, AtenTensorHandle* found_inf);
// Out-variant shims for the fused SGD optimizer step.
// Handle arrays with lengths: out, self, grads and momentum_buffer_list.
// The `_tensor_lr` variant takes `lr` as a tensor handle instead of a double;
// note that `lr` sits between `momentum` and `dampening` in both signatures.
// `grad_scale` and `found_inf` are pointers to handles.
// NOTE(review): those two are presumably nullable optionals, and nesterov /
// maximize / is_first_step are presumably booleans - confirm against shim.h.
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_sgd_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* grads, int64_t grads_len_, const AtenTensorHandle* momentum_buffer_list, int64_t momentum_buffer_list_len_, double weight_decay, double momentum, double lr, double dampening, int32_t nesterov, int32_t maximize, int32_t is_first_step, AtenTensorHandle* grad_scale, AtenTensorHandle* found_inf);
AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__fused_sgd_tensor_lr_out(const AtenTensorHandle* out, int64_t out_len_, const AtenTensorHandle* self, int64_t self_len_, const AtenTensorHandle* grads, int64_t grads_len_, const AtenTensorHandle* momentum_buffer_list, int64_t momentum_buffer_list_len_, double weight_decay, double momentum, AtenTensorHandle lr, double dampening, int32_t nesterov, int32_t maximize, int32_t is_first_step, AtenTensorHandle* grad_scale, AtenTensorHandle* found_inf);

#ifdef __cplusplus
} // extern "C"
#endif

